update eigen to 3.3.9

alemuntoni 2021-06-15 12:27:04 +02:00
parent 4b8f73d81c
commit 8dc26dbe93
155 changed files with 2860 additions and 1893 deletions

View File

@@ -9,6 +9,7 @@
 #define EIGEN_CHOLESKY_MODULE_H
 #include "Core"
+#include "Jacobi"
 #include "src/Core/util/DisableStupidWarnings.h"
@@ -31,7 +32,11 @@
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/Cholesky/LLT_LAPACKE.h"
 #endif
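(The same EIGEN_USE_MKL/EIGEN_USE_LAPACKE guard is added to the Eigenvalues, LU, QR and SVD module headers below.) A minimal usage sketch, not part of the commit: assuming an MKL installation and the matching include/link flags, defining EIGEN_USE_MKL_ALL (which pulls in the LAPACKE backends) now routes these decompositions through MKL's own mkl_lapacke.h instead of Eigen's bundled src/misc/lapacke.h.

#define EIGEN_USE_MKL_ALL   // assumption: MKL headers and libraries are available
#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 100);
  Eigen::MatrixXd S = A * A.transpose()
                    + Eigen::MatrixXd::Identity(100, 100);  // make S positive definite
  Eigen::LLT<Eigen::MatrixXd> llt(S);  // LLT is one of the LAPACKE-backed solvers
  return llt.info() == Eigen::Success ? 0 : 1;
}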

View File

@@ -14,6 +14,22 @@
 // first thing Eigen does: stop the compiler from committing suicide
 #include "src/Core/util/DisableStupidWarnings.h"
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+#define EIGEN_CUDACC __CUDACC__
+#endif
+
+#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
+#define EIGEN_CUDA_ARCH __CUDA_ARCH__
+#endif
+
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+#define EIGEN_CUDACC_VER __CUDACC_VER__
+#else
+#define EIGEN_CUDACC_VER 0
+#endif
 // Handle NVCC/CUDA/SYCL
 #if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
   // Do not try asserts on CUDA and SYCL!
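The new EIGEN_CUDACC_VER definition is what keeps the later "#if EIGEN_CUDACC_VER >= 70500" check working on CUDA 9 and newer, where the monolithic __CUDACC_VER__ macro was removed in favor of major/minor macros. A worked illustration (assuming nvcc 9.1, i.e. __CUDACC_VER_MAJOR__ == 9 and __CUDACC_VER_MINOR__ == 1):

// nvcc 9.1: EIGEN_CUDACC_VER = 9 * 10000 + 1 * 100 = 90100
// nvcc 8.0: __CUDACC_VER__ is still defined, so EIGEN_CUDACC_VER = 80000
// no CUDA:  EIGEN_CUDACC_VER = 0, so every version gate is disabled
#if EIGEN_CUDACC_VER >= 70500
#define EIGEN_HAS_CUDA_FP16   // same gate as below, now version-scheme agnostic
#endif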
@@ -37,9 +53,9 @@
 #endif
 #define EIGEN_DEVICE_FUNC __host__ __device__
-// We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro
+// We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
 // works properly on the device side
-#include <math_functions.hpp>
+#include <cuda_runtime.h>
 #else
 #define EIGEN_DEVICE_FUNC
 #endif
@@ -155,6 +171,9 @@
 #ifdef __AVX512DQ__
 #define EIGEN_VECTORIZE_AVX512DQ
 #endif
+#ifdef __AVX512ER__
+#define EIGEN_VECTORIZE_AVX512ER
+#endif
 #endif
 // include files
@@ -229,7 +248,7 @@
 #if defined __CUDACC__
 #define EIGEN_VECTORIZE_CUDA
 #include <vector_types.h>
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#if EIGEN_CUDACC_VER >= 70500
 #define EIGEN_HAS_CUDA_FP16
 #endif
 #endif
@@ -260,7 +279,10 @@
 #include <cmath>
 #include <cassert>
 #include <functional>
-#include <iosfwd>
+#include <sstream>
+#ifndef EIGEN_NO_IO
+#include <iosfwd>
+#endif
 #include <cstring>
 #include <string>
 #include <limits>
@@ -321,12 +343,16 @@ inline static const char *SimdInstructionSetsInUse(void) {
 #error Eigen2-support is only available up to version 3.2. Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information
 #endif
+namespace Eigen {
+
 // we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
 // ensure QNX/QCC support
 using std::size_t;
 // gcc 4.6.0 wants std:: for ptrdiff_t
 using std::ptrdiff_t;
+
+}
 /** \defgroup Core_Module Core module
   * This is the main module of Eigen providing dense matrix and vector support
   * (both fixed and dynamic size) with all the features corresponding to a BLAS library
@@ -348,10 +374,13 @@ using std::ptrdiff_t;
 #include "src/Core/MathFunctions.h"
 #include "src/Core/GenericPacketMath.h"
 #include "src/Core/MathFunctionsImpl.h"
+#include "src/Core/arch/Default/ConjHelper.h"
 #if defined EIGEN_VECTORIZE_AVX512
   #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
   #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX/MathFunctions.h"
   #include "src/Core/arch/AVX512/PacketMath.h"
   #include "src/Core/arch/AVX512/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_AVX
@@ -363,6 +392,7 @@ using std::ptrdiff_t;
   #include "src/Core/arch/AVX/MathFunctions.h"
   #include "src/Core/arch/AVX/Complex.h"
   #include "src/Core/arch/AVX/TypeCasting.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
 #elif defined EIGEN_VECTORIZE_SSE
   #include "src/Core/arch/SSE/PacketMath.h"
   #include "src/Core/arch/SSE/MathFunctions.h"
@@ -405,6 +435,7 @@ using std::ptrdiff_t;
 // on CUDA devices
 #include "src/Core/arch/CUDA/Complex.h"
+#include "src/Core/IO.h"
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
@@ -452,7 +483,6 @@ using std::ptrdiff_t;
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
 #include "src/Core/Fuzzy.h"
-#include "src/Core/IO.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
 #include "src/Core/GeneralProduct.h"

View File

@@ -10,14 +10,14 @@
 #include "Core"
-#include "src/Core/util/DisableStupidWarnings.h"
 #include "Cholesky"
 #include "Jacobi"
 #include "Householder"
 #include "LU"
 #include "Geometry"
+#include "src/Core/util/DisableStupidWarnings.h"
 /** \defgroup Eigenvalues_Module Eigenvalues module
   *
   *
@@ -45,7 +45,11 @@
 #include "src/Eigenvalues/GeneralizedEigenSolver.h"
 #include "src/Eigenvalues/MatrixBaseEigenvalues.h"
 #ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/Eigenvalues/RealSchur_LAPACKE.h"
 #include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
 #include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"

View File

@@ -10,12 +10,12 @@
 #include "Core"
-#include "src/Core/util/DisableStupidWarnings.h"
 #include "SVD"
 #include "LU"
 #include <limits>
+#include "src/Core/util/DisableStupidWarnings.h"
 /** \defgroup Geometry_Module Geometry module
   *
   * This module provides support for:

View File

@@ -28,7 +28,11 @@
 #include "src/LU/FullPivLU.h"
 #include "src/LU/PartialPivLU.h"
 #ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/LU/PartialPivLU_LAPACKE.h"
 #endif
 #include "src/LU/Determinant.h"

View File

@@ -10,12 +10,12 @@
 #include "Core"
-#include "src/Core/util/DisableStupidWarnings.h"
 #include "Cholesky"
 #include "Jacobi"
 #include "Householder"
+#include "src/Core/util/DisableStupidWarnings.h"
 /** \defgroup QR_Module QR module
   *
   *
@@ -36,7 +36,11 @@
 #include "src/QR/ColPivHouseholderQR.h"
 #include "src/QR/CompleteOrthogonalDecomposition.h"
 #ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/QR/HouseholderQR_LAPACKE.h"
 #include "src/QR/ColPivHouseholderQR_LAPACKE.h"
 #endif

View File

@@ -14,7 +14,7 @@
 #include "src/Core/util/DisableStupidWarnings.h"
-void *qMalloc(size_t size)
+void *qMalloc(std::size_t size)
 {
   return Eigen::internal::aligned_malloc(size);
 }
@@ -24,10 +24,10 @@ void qFree(void *ptr)
   Eigen::internal::aligned_free(ptr);
 }
-void *qRealloc(void *ptr, size_t size)
+void *qRealloc(void *ptr, std::size_t size)
 {
   void* newPtr = Eigen::internal::aligned_malloc(size);
-  memcpy(newPtr, ptr, size);
+  std::memcpy(newPtr, ptr, size);
   Eigen::internal::aligned_free(ptr);
   return newPtr;
 }

View File

@@ -37,7 +37,11 @@
 #include "src/SVD/JacobiSVD.h"
 #include "src/SVD/BDCSVD.h"
 #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/SVD/JacobiSVD_LAPACKE.h"
 #endif

View File

@@ -25,7 +25,9 @@
 #include "SparseCore"
 #include "OrderingMethods"
+#ifndef EIGEN_MPL2_ONLY
 #include "SparseCholesky"
+#endif
 #include "SparseLU"
 #include "SparseQR"
 #include "IterativeLinearSolvers"

View File

@@ -28,7 +28,6 @@
   *
   */
-#include "OrderingMethods"
 #include "src/SparseCore/SparseColEtree.h"
 #include "src/SparseQR/SparseQR.h"

View File

@@ -14,7 +14,7 @@
 #include "Core"
 #include <deque>
-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...)

View File

@@ -13,7 +13,7 @@
 #include "Core"
 #include <list>
-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...)

View File

@@ -14,7 +14,7 @@
 #include "Core"
 #include <vector>
-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 #define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...)
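The rationale behind all three StdDeque/StdList/StdVector changes: 64-bit MSVC only guarantees 16-byte alignment for heap allocations, which was enough for SSE packets but not for 32-byte AVX or 64-byte AVX-512 ones, so the no-op shortcut is now limited to EIGEN_MAX_STATIC_ALIGN_BYTES<=16. A minimal sketch of the documented macro these headers define (Matrix4f is just an example type):

#include <Eigen/StdVector>

// Required (pre-C++17) so std::vector uses Eigen's aligned allocator
// for this fixed-size vectorizable type:
EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Eigen::Matrix4f)

int main() {
  std::vector<Eigen::Matrix4f> v(8, Eigen::Matrix4f::Identity());
  return v.size() == 8 ? 0 : 1;
}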

View File

@@ -248,7 +248,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
   /** \brief Reports whether previous computation was successful.
     *
     * \returns \c Success if computation was succesful,
-    *          \c NumericalIssue if the matrix.appears to be negative.
+    *          \c NumericalIssue if the factorization failed because of a zero pivot.
     */
   ComputationInfo info() const
   {
@@ -305,7 +305,8 @@ template<> struct ldlt_inplace<Lower>
     if (size <= 1)
     {
       transpositions.setIdentity();
-      if (numext::real(mat.coeff(0,0)) > static_cast<RealScalar>(0) ) sign = PositiveSemiDef;
+      if(size==0) sign = ZeroSign;
+      else if (numext::real(mat.coeff(0,0)) > static_cast<RealScalar>(0) ) sign = PositiveSemiDef;
       else if (numext::real(mat.coeff(0,0)) < static_cast<RealScalar>(0)) sign = NegativeSemiDef;
       else sign = ZeroSign;
       return true;
@@ -376,6 +377,8 @@ template<> struct ldlt_inplace<Lower>
       if((rs>0) && pivot_is_valid)
         A21 /= realAkk;
+      else if(rs>0)
+        ret = ret && (A21.array()==Scalar(0)).all();
       if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed
       else if(!pivot_is_valid) found_zero_pivot = true;
@@ -568,13 +571,14 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
   // more precisely, use pseudo-inverse of D (see bug 241)
   using std::abs;
   const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
-  // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
-  // as motivated by LAPACK's xGELSS:
+  // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min())
+  // and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS:
   // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
   // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
   // diagonal element is not well justified and leads to numerical issues in some cases.
   // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
-  RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
+  // Using numeric_limits::min() gives us more robustness to denormals.
+  RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
   for (Index i = 0; i < vecD.size(); ++i)
   {
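For scale (an illustration with double, not part of the diff): 1/NumTraits<double>::highest() is about 1/1.8e308 ≈ 5.6e-309, which is itself a denormal, while std::numeric_limits<double>::min() ≈ 2.2e-308 is the smallest *normal* value. Comparing pivots against the smallest normal rather than a denormal threshold is the robustness gain the new comment refers to; the extra parentheses in (std::numeric_limits<RealScalar>::min)() also keep Windows' min() macro from expanding.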

View File

@@ -24,7 +24,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
   *
   * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
   * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
-  *             The other triangular part won't be read.
+  *               The other triangular part won't be read.
   *
   * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
   * matrix A such that A = LL^* = U^*U, where L is lower triangular.
@@ -41,14 +41,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
   * Example: \include LLT_example.cpp
   * Output: \verbinclude LLT_example.out
   *
+  * \b Performance: for best performance, it is recommended to use a column-major storage format
+  * with the Lower triangular part (the default), or, equivalently, a row-major storage format
+  * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
+  * step, and rank-updates can be up to 3 times slower.
+  *
   * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
   *
+  * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
+  * Therefore, the strict lower part does not have to store correct values.
+  *
   * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
   */
-/* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
- * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
- * the strict lower part does not have to store correct values.
- */
 template<typename _MatrixType, int _UpLo> class LLT
 {
   public:
@@ -146,7 +150,7 @@ template<typename _MatrixType, int _UpLo> class LLT
     }
     template<typename Derived>
-    void solveInPlace(MatrixBase<Derived> &bAndX) const;
+    void solveInPlace(const MatrixBase<Derived> &bAndX) const;
     template<typename InputType>
     LLT& compute(const EigenBase<InputType>& matrix);
@@ -177,7 +181,7 @@ template<typename _MatrixType, int _UpLo> class LLT
   /** \brief Reports whether previous computation was successful.
     *
     * \returns \c Success if computation was succesful,
-    *          \c NumericalIssue if the matrix.appears to be negative.
+    *          \c NumericalIssue if the matrix.appears not to be positive definite.
     */
   ComputationInfo info() const
   {
@@ -425,7 +429,8 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
   eigen_assert(a.rows()==a.cols());
   const Index size = a.rows();
   m_matrix.resize(size, size);
-  m_matrix = a.derived();
+  if (!internal::is_same_dense(m_matrix, a.derived()))
+    m_matrix = a.derived();
   // Compute matrix L1 norm = max abs column sum.
   m_l1_norm = RealScalar(0);
@@ -485,11 +490,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
   *
   * This version avoids a copy when the right hand side matrix b is not needed anymore.
   *
+  * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+  * This function will const_cast it, so constness isn't honored here.
+  *
   * \sa LLT::solve(), MatrixBase::llt()
   */
 template<typename MatrixType, int _UpLo>
 template<typename Derived>
-void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
+void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
 {
   eigen_assert(m_isInitialized && "LLT is not initialized.");
   eigen_assert(m_matrix.rows()==bAndX.rows());
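A minimal usage sketch of the in-place solve and the \warning above (the matrices are illustrative, the API is LLT's):

#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
  Eigen::MatrixXd S = A * A.transpose()
                    + 4.0 * Eigen::MatrixXd::Identity(4, 4);  // positive definite
  Eigen::LLT<Eigen::MatrixXd> llt(S);

  Eigen::VectorXd b = Eigen::VectorXd::Ones(4);
  llt.solveInPlace(b);  // b now holds x with S*x = 1; the const& is only a
                        // binding trick, the argument really is modified
  return (S * b).isApprox(Eigen::VectorXd::Ones(4)) ? 0 : 1;
}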

View File

@@ -153,8 +153,6 @@ class Array
       : Base(std::move(other))
     {
       Base::_check_template_params();
-      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
-        Base::_set_noalias(other);
     }
     EIGEN_DEVICE_FUNC
     Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
@@ -231,10 +229,16 @@ class Array
       : Base(other)
     { }
+  private:
+    struct PrivateType {};
+  public:
+
     /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
+    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other,
+                              typename internal::enable_if<internal::is_convertible<typename OtherDerived::Scalar,Scalar>::value,
+                                                           PrivateType>::type = PrivateType())
       : Base(other.derived())
     { }

View File

@@ -153,8 +153,8 @@ template<typename Derived> class ArrayBase
     // inline void evalTo(Dest& dst) const { dst = matrix(); }
   protected:
-    EIGEN_DEVICE_FUNC
-    ArrayBase() : Base() {}
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(ArrayBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(ArrayBase)
   private:
     explicit ArrayBase(Index);
@@ -175,7 +175,7 @@ template<typename Derived> class ArrayBase
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
 {
   call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -188,7 +188,7 @@ ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
 {
   call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -201,7 +201,7 @@ ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
 {
   call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -214,7 +214,7 @@ ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
 {
   call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());

View File

@@ -32,7 +32,8 @@ struct traits<ArrayWrapper<ExpressionType> >
     // Let's remove NestByRefBit
     enum {
       Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
-      Flags = Flags0 & ~NestByRefBit
+      LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+      Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
     };
   };
 }
@@ -129,7 +130,8 @@ struct traits<MatrixWrapper<ExpressionType> >
     // Let's remove NestByRefBit
     enum {
       Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
-      Flags = Flags0 & ~NestByRefBit
+      LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+      Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
     };
   };
 }

View File

@@ -39,7 +39,7 @@ public:
   enum {
     DstAlignment = DstEvaluator::Alignment,
    SrcAlignment = SrcEvaluator::Alignment,
-    DstHasDirectAccess = DstFlags & DirectAccessBit,
+    DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit,
     JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
   };
@@ -83,7 +83,7 @@ private:
       && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
      && (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)),
     MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
-    MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
+    MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess)
       && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
     /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
        so it's only good for large enough sizes. */
@@ -515,7 +515,7 @@ struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
 {
-  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::Scalar Scalar;
     typedef typename Kernel::PacketType PacketType;
@@ -563,7 +563,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
 {
-  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
     typedef typename Kernel::PacketType PacketType;
@@ -701,6 +701,26 @@ protected:
  * Part 5 : Entry point for dense rectangular assignment
  ***************************************************************************/
+template<typename DstXprType,typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &/*func*/)
+{
+  EIGEN_ONLY_USED_FOR_DEBUG(dst);
+  EIGEN_ONLY_USED_FOR_DEBUG(src);
+  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+}
+
+template<typename DstXprType,typename SrcXprType, typename T1, typename T2>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::assign_op<T1,T2> &/*func*/)
+{
+  Index dstRows = src.rows();
+  Index dstCols = src.cols();
+  if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols)))
+    dst.resize(dstRows, dstCols);
+  eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols);
+}
+
 template<typename DstXprType, typename SrcXprType, typename Functor>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
 {
@@ -711,10 +731,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
   // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
   // we need to resize the destination after the source evaluator has been created.
-  Index dstRows = src.rows();
-  Index dstCols = src.cols();
-  if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
-    dst.resize(dstRows, dstCols);
+  resize_if_allowed(dst, src, func);
   DstEvaluatorType dstEvaluator(dst);
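The corner case named in the NOTE, as a user-level sketch (my example, built from the comment's scenario): the destination appears on both sides of the assignment and even changes shape, so resizing it before the source evaluator exists would corrupt the right-hand side.

#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(3, 5);
  // A is both destination and operand, and goes from 3x5 to 3x3;
  // the source evaluator must see the original 3x5 data.
  A = (A * A.transpose()) / 2.0;
  return (A.rows() == 3 && A.cols() == 3) ? 0 : 1;
}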

View File

@@ -84,7 +84,8 @@ class vml_assign_traits
   struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>, \
                     Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
     typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType; \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
+      resize_if_allowed(dst, src, func); \
       eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
       if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) { \
         VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \
@@ -144,7 +145,8 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _)
                     Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
     typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested, \
                           const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType; \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
+      resize_if_allowed(dst, src, func); \
       eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
       VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other); \
       if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) \

View File

@@ -160,7 +160,7 @@ rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Deco
 {
   typedef typename Decomposition::RealScalar RealScalar;
   eigen_assert(dec.rows() == dec.cols());
-  if (dec.rows() == 0)              return RealScalar(1);
+  if (dec.rows() == 0)              return NumTraits<RealScalar>::infinity();
   if (matrix_norm == RealScalar(0)) return RealScalar(0);
   if (dec.rows() == 1)              return RealScalar(1);
   const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec);
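For reference (standard definition, not introduced by this commit): the quantity being estimated is the reciprocal condition number rcond(A) = 1 / (||A||_1 * ||A^-1||_1). A 0x0 system has no singular values that could make it ill-conditioned, so the empty case now reports infinity (perfectly conditioned) instead of the generic 1.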

View File

@@ -977,7 +977,7 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
     OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
                              ? int(outer_stride_at_compile_time<ArgType>::ret)
                              : int(inner_stride_at_compile_time<ArgType>::ret),
-    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0,
+    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,
     FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
     FlagsRowMajorBit = XprType::Flags&RowMajorBit,
@@ -987,7 +987,9 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
     Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
     PacketAlignment = unpacket_traits<PacketScalar>::alignment,
-    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
+    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic)
+                             && (OuterStrideAtCompileTime!=0)
+                             && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
     Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
   };
   typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
@@ -1018,14 +1020,16 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
     : m_argImpl(block.nestedExpression()),
       m_startRow(block.startRow()),
-      m_startCol(block.startCol())
+      m_startCol(block.startCol()),
+      m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0)
   { }
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   enum {
-    RowsAtCompileTime = XprType::RowsAtCompileTime
+    RowsAtCompileTime = XprType::RowsAtCompileTime,
+    ForwardLinearAccess = InnerPanel && bool(evaluator<ArgType>::Flags&LinearAccessBit)
   };
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1037,7 +1041,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+    if (ForwardLinearAccess)
+      return m_argImpl.coeff(m_linear_offset.value() + index);
+    else
+      return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1049,7 +1056,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
   {
-    return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+    if (ForwardLinearAccess)
+      return m_argImpl.coeffRef(m_linear_offset.value() + index);
+    else
+      return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
   }
   template<int LoadMode, typename PacketType>
@@ -1063,8 +1073,11 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
   {
-    return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
-                                       RowsAtCompileTime == 1 ? index : 0);
+    if (ForwardLinearAccess)
+      return m_argImpl.template packet<LoadMode,PacketType>(m_linear_offset.value() + index);
+    else
+      return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                         RowsAtCompileTime == 1 ? index : 0);
   }
   template<int StoreMode, typename PacketType>
@@ -1078,15 +1091,19 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketType& x)
   {
-    return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
-                                             RowsAtCompileTime == 1 ? index : 0,
-                                             x);
+    if (ForwardLinearAccess)
+      return m_argImpl.template writePacket<StoreMode,PacketType>(m_linear_offset.value() + index, x);
+    else
+      return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                               RowsAtCompileTime == 1 ? index : 0,
+                                               x);
   }
 protected:
   evaluator<ArgType> m_argImpl;
   const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
   const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
+  const variable_if_dynamic<Index, InnerPanel ? Dynamic : 0> m_linear_offset;
 };
 // TODO: This evaluator does not actually use the child evaluator;
@@ -1556,9 +1573,7 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
   { }
   typedef typename XprType::Scalar Scalar;
-  // FIXME having to check whether ArgType is sparse here i not very nice.
-  typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value,
-                                         typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index) const
View File

@@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
 {
   return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
@@ -150,7 +150,7 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
 {
   return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
@@ -192,7 +192,7 @@ DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(Index size, const Scalar& value)
 {
   return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
@@ -208,7 +208,7 @@ DenseBase<Derived>::Constant(Index size, const Scalar& value)
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(const Scalar& value)
 {
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
   * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -232,7 +232,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const
   * \sa LinSpaced(Scalar,Scalar)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -264,7 +264,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
   * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -276,7 +276,7 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
   * Special version for fixed size types which does not require the size parameter.
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -286,7 +286,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
 template<typename Derived>
-bool DenseBase<Derived>::isApproxToConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant
 (const Scalar& val, const RealScalar& prec) const
 {
   typename internal::nested_eval<Derived,1>::type self(derived());
@@ -301,7 +301,7 @@ bool DenseBase<Derived>::isApproxToConstant
   *
   * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
 template<typename Derived>
-bool DenseBase<Derived>::isConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isConstant
 (const Scalar& val, const RealScalar& prec) const
 {
   return isApproxToConstant(val, prec);
@@ -312,7 +312,7 @@ bool DenseBase<Derived>::isConstant
   * \sa setConstant(), Constant(), class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
 {
   setConstant(val);
 }
@@ -322,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
   * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
 {
   return derived() = Constant(rows(), cols(), val);
 }
@@ -337,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
   * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
 {
   resize(size);
@@ -356,7 +356,7 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
   * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
 {
   resize(rows, cols);
@@ -380,7 +380,7 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
   * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
@@ -400,7 +400,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
   * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return setLinSpaced(size(), low, high);
@@ -423,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
   * \sa Zero(), Zero(Index)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero(Index rows, Index cols)
 {
   return Constant(rows, cols, Scalar(0));
@@ -446,7 +446,7 @@ DenseBase<Derived>::Zero(Index rows, Index cols)
   * \sa Zero(), Zero(Index,Index)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero(Index size)
 {
   return Constant(size, Scalar(0));
@@ -463,7 +463,7 @@ DenseBase<Derived>::Zero(Index size)
   * \sa Zero(Index), Zero(Index,Index)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero()
 {
   return Constant(Scalar(0));
@@ -478,7 +478,7 @@ DenseBase<Derived>::Zero()
   * \sa class CwiseNullaryOp, Zero()
   */
 template<typename Derived>
-bool DenseBase<Derived>::isZero(const RealScalar& prec) const
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const
 {
   typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index j = 0; j < cols(); ++j)
@@ -496,7 +496,7 @@ bool DenseBase<Derived>::isZero(const RealScalar& prec) const
   * \sa class CwiseNullaryOp, Zero()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
 {
   return setConstant(Scalar(0));
 }
@@ -511,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
   * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(Index newSize)
 {
   resize(newSize);
@@ -529,7 +529,7 @@ PlainObjectBase<Derived>::setZero(Index newSize)
   * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(Index rows, Index cols)
 {
   resize(rows, cols);
@@ -553,7 +553,7 @@ PlainObjectBase<Derived>::setZero(Index rows, Index cols)
   * \sa Ones(), Ones(Index), isOnes(), class Ones
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones(Index rows, Index cols)
 {
   return Constant(rows, cols, Scalar(1));
@@ -576,7 +576,7 @@ DenseBase<Derived>::Ones(Index rows, Index cols)
   * \sa Ones(), Ones(Index,Index), isOnes(), class Ones
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones(Index newSize)
 {
   return Constant(newSize, Scalar(1));
@@ -593,7 +593,7 @@ DenseBase<Derived>::Ones(Index newSize)
   * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones()
 {
   return Constant(Scalar(1));
@@ -608,7 +608,7 @@ DenseBase<Derived>::Ones()
   * \sa class CwiseNullaryOp, Ones()
   */
 template<typename Derived>
-bool DenseBase<Derived>::isOnes
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isOnes
 (const RealScalar& prec) const
 {
   return isApproxToConstant(Scalar(1), prec);
@@ -622,7 +622,7 @@ bool DenseBase<Derived>::isOnes
   * \sa class CwiseNullaryOp, Ones()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
 {
   return setConstant(Scalar(1));
 }
@@ -637,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
   * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(Index newSize)
 {
   resize(newSize);
@@ -655,7 +655,7 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
   * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
 {
   resize(rows, cols);
@@ -679,7 +679,7 @@ PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
   * \sa Identity(), setIdentity(), isIdentity()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
 MatrixBase<Derived>::Identity(Index rows, Index cols)
 {
   return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
@ -696,7 +696,7 @@ MatrixBase<Derived>::Identity(Index rows, Index cols)
* \sa Identity(Index,Index), setIdentity(), isIdentity() * \sa Identity(Index,Index), setIdentity(), isIdentity()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
MatrixBase<Derived>::Identity() MatrixBase<Derived>::Identity()
{ {
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@ -771,7 +771,7 @@ struct setIdentity_impl<Derived, true>
* \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity() * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity() EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
{ {
return internal::setIdentity_impl<Derived>::run(derived()); return internal::setIdentity_impl<Derived>::run(derived());
} }
@ -787,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity() * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
{ {
derived().resize(rows, cols); derived().resize(rows, cols);
return setIdentity(); return setIdentity();
@ -800,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index
* \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
{ {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i); return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
@ -815,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
{ {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return BasisReturnType(SquareMatrixType::Identity(),i); return BasisReturnType(SquareMatrixType::Identity(),i);
@ -828,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX() EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
{ return Derived::Unit(0); } { return Derived::Unit(0); }
/** \returns an expression of the Y axis unit vector (0,1{,0}^*) /** \returns an expression of the Y axis unit vector (0,1{,0}^*)
@ -838,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY() EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
{ return Derived::Unit(1); } { return Derived::Unit(1); }
/** \returns an expression of the Z axis unit vector (0,0,1{,0}^*) /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
@ -848,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ() EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
{ return Derived::Unit(2); } { return Derived::Unit(2); }
/** \returns an expression of the W axis unit vector (0,0,0,1) /** \returns an expression of the W axis unit vector (0,0,0,1)
@ -858,7 +858,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW() EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
{ return Derived::Unit(3); } { return Derived::Unit(3); }
} // end namespace Eigen } // end namespace Eigen
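Taken together, the EIGEN_DEVICE_FUNC-decorated factories above form the usual construction API; a minimal host-side usage sketch (standard Eigen calls, not part of this commit):

#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd z = Eigen::MatrixXd::Zero(2, 3);   // 2x3 matrix of zeros
  Eigen::Matrix3d id = Eigen::Matrix3d::Identity();  // fixed-size identity
  Eigen::VectorXd o = Eigen::VectorXd::Ones(5);      // 5-vector of ones
  Eigen::Vector4d u = Eigen::Vector4d::Unit(2);      // basis vector (0,0,1,0)
  (void)u;
  return (z.isZero() && o.isOnes() && id.isIdentity()) ? 0 : 1;
}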

View File

@ -121,6 +121,8 @@ class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
{ {
return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar); return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
} }
protected:
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl)
}; };
} // end namespace Eigen } // end namespace Eigen

View File

@ -40,7 +40,7 @@ static inline void check_DenseIndex_is_signed() {
*/ */
template<typename Derived> class DenseBase template<typename Derived> class DenseBase
#ifndef EIGEN_PARSED_BY_DOXYGEN #ifndef EIGEN_PARSED_BY_DOXYGEN
: public DenseCoeffsBase<Derived> : public DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value>
#else #else
: public DenseCoeffsBase<Derived,DirectWriteAccessors> : public DenseCoeffsBase<Derived,DirectWriteAccessors>
#endif // not EIGEN_PARSED_BY_DOXYGEN #endif // not EIGEN_PARSED_BY_DOXYGEN
@ -71,7 +71,7 @@ template<typename Derived> class DenseBase
typedef Scalar value_type; typedef Scalar value_type;
typedef typename NumTraits<Scalar>::Real RealScalar; typedef typename NumTraits<Scalar>::Real RealScalar;
typedef DenseCoeffsBase<Derived> Base; typedef DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value> Base;
using Base::derived; using Base::derived;
using Base::const_cast_derived; using Base::const_cast_derived;
@ -296,7 +296,7 @@ template<typename Derived> class DenseBase
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
Derived& operator=(const ReturnByValue<OtherDerived>& func); Derived& operator=(const ReturnByValue<OtherDerived>& func);
/** \ínternal /** \internal
* Copies \a other into *this without evaluating other. \returns a reference to *this. * Copies \a other into *this without evaluating other. \returns a reference to *this.
* \deprecated */ * \deprecated */
template<typename OtherDerived> template<typename OtherDerived>
@ -463,7 +463,17 @@ template<typename Derived> class DenseBase
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
void visit(Visitor& func) const; void visit(Visitor& func) const;
inline const WithFormat<Derived> format(const IOFormat& fmt) const; /** \returns a WithFormat proxy object allowing to print a matrix with the given
* format \a fmt.
*
* See class IOFormat for some examples.
*
* \sa class IOFormat, class WithFormat
*/
inline const WithFormat<Derived> format(const IOFormat& fmt) const
{
return WithFormat<Derived>(derived(), fmt);
}
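As a usage sketch (plain Eigen API, nothing new in this commit), the proxy pairs with IOFormat like so:

#include <iostream>
#include <Eigen/Dense>

int main() {
  Eigen::Matrix2d m;
  m << 1, 2,
       3, 4;
  // precision 4, default flags, ", " between coefficients, rows wrapped in [ ]
  Eigen::IOFormat clean(4, 0, ", ", "\n", "[", "]");
  std::cout << m.format(clean) << std::endl;  // prints through the WithFormat proxy
}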
/** \returns the unique coefficient of a 1x1 expression */ /** \returns the unique coefficient of a 1x1 expression */
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
@ -474,9 +484,9 @@ template<typename Derived> class DenseBase
return derived().coeff(0,0); return derived().coeff(0,0);
} }
bool all() const; EIGEN_DEVICE_FUNC bool all() const;
bool any() const; EIGEN_DEVICE_FUNC bool any() const;
Index count() const; EIGEN_DEVICE_FUNC Index count() const;
typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType; typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType; typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;
@ -577,11 +587,12 @@ template<typename Derived> class DenseBase
} }
protected: protected:
EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase)
/** Default constructor. Do nothing. */ /** Default constructor. Do nothing. */
EIGEN_DEVICE_FUNC DenseBase() EIGEN_DEVICE_FUNC DenseBase()
{ {
/* Just checks for self-consistency of the flags. /* Just checks for self-consistency of the flags.
* Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down * Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down
*/ */
#ifdef EIGEN_INTERNAL_DEBUGGING #ifdef EIGEN_INTERNAL_DEBUGGING
EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor)) EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor))

View File

@ -13,9 +13,9 @@
#define EIGEN_MATRIXSTORAGE_H #define EIGEN_MATRIXSTORAGE_H
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_DENSE_STORAGE_CTOR_PLUGIN; #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) X; EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
#else #else
#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X)
#endif #endif
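The plugin hook now receives a declaration X that brings a `size` variable into scope before the user macro runs. A hypothetical client instrumentation (the counter name is illustrative, not from Eigen):

// Define before including any Eigen header; `size` is declared by the X
// argument at each expansion site shown below.
extern long g_eigen_coeffs_constructed;  // hypothetical global counter
#define EIGEN_DENSE_STORAGE_CTOR_PLUGIN { g_eigen_coeffs_constructed += size; }
#include <Eigen/Dense>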
namespace Eigen { namespace Eigen {
@ -184,12 +184,16 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
{ {
internal::plain_array<T,Size,_Options> m_data; internal::plain_array<T,Size,_Options> m_data;
public: public:
EIGEN_DEVICE_FUNC DenseStorage() {} EIGEN_DEVICE_FUNC DenseStorage() {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
}
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
explicit DenseStorage(internal::constructor_without_unaligned_array_assert) explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
: m_data(internal::constructor_without_unaligned_array_assert()) {} : m_data(internal::constructor_without_unaligned_array_assert()) {}
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
DenseStorage(const DenseStorage& other) : m_data(other.m_data) {} DenseStorage(const DenseStorage& other) : m_data(other.m_data) {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
}
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
DenseStorage& operator=(const DenseStorage& other) DenseStorage& operator=(const DenseStorage& other)
{ {
@ -197,7 +201,7 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
return *this; return *this;
} }
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols); eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols);
EIGEN_UNUSED_VARIABLE(size); EIGEN_UNUSED_VARIABLE(size);
EIGEN_UNUSED_VARIABLE(rows); EIGEN_UNUSED_VARIABLE(rows);
@ -343,7 +347,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows), m_cols(cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows), m_cols(cols)
{ {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0); eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0);
} }
EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
@ -351,6 +355,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
, m_rows(other.m_rows) , m_rows(other.m_rows)
, m_cols(other.m_cols) , m_cols(other.m_cols)
{ {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*m_cols)
internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data); internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data);
} }
EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@ -399,11 +404,11 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
if(size != m_rows*m_cols) if(size != m_rows*m_cols)
{ {
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols);
if (size) if (size>0) // >0 and not simply !=0 to let the compiler know that size cannot be negative
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size); m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else else
m_data = 0; m_data = 0;
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
} }
m_rows = rows; m_rows = rows;
m_cols = cols; m_cols = cols;
@ -422,7 +427,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {} explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(cols) EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(cols)
{ {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0); eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0);
EIGEN_UNUSED_VARIABLE(rows); EIGEN_UNUSED_VARIABLE(rows);
} }
@ -430,6 +435,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols)) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
, m_cols(other.m_cols) , m_cols(other.m_cols)
{ {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows)
internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data); internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data);
} }
EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@ -473,11 +479,11 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
if(size != _Rows*m_cols) if(size != _Rows*m_cols)
{ {
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols);
if (size) if (size>0) // >0 and not simply !=0 to let the compiler know that size cannot be negative
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size); m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else else
m_data = 0; m_data = 0;
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
} }
m_cols = cols; m_cols = cols;
} }
@ -495,7 +501,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {} explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows) EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows)
{ {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols); eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols);
EIGEN_UNUSED_VARIABLE(cols); EIGEN_UNUSED_VARIABLE(cols);
} }
@ -503,6 +509,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols)) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
, m_rows(other.m_rows) , m_rows(other.m_rows)
{ {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols)
internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data); internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data);
} }
EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@ -546,11 +553,11 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
if(size != m_rows*_Cols) if(size != m_rows*_Cols)
{ {
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows);
if (size) if (size>0) // >0 and not simply !=0 to let the compiler know that size cannot be negative
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size); m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else else
m_data = 0; m_data = 0;
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
} }
m_rows = rows; m_rows = rows;
} }

View File

@ -21,7 +21,7 @@ namespace Eigen {
* \param MatrixType the type of the object in which we are taking a sub/main/super diagonal * \param MatrixType the type of the object in which we are taking a sub/main/super diagonal
* \param DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal. * \param DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal.
* A positive value means a superdiagonal, a negative value means a subdiagonal. * A positive value means a superdiagonal, a negative value means a subdiagonal.
* You can also use Dynamic so the index can be set at runtime. * You can also use DynamicIndex so the index can be set at runtime.
* *
* The matrix is not required to be square. * The matrix is not required to be square.
* *
@ -70,7 +70,10 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal) EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {} explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index)
{
eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() );
}
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
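The new assertion bounds the runtime diagonal index; typical access looks like this (standard API):

#include <Eigen/Dense>

int main() {
  Eigen::Matrix4d m = Eigen::Matrix4d::Random();
  m.diagonal<1>().setZero();  // first superdiagonal, index fixed at compile time
  m.diagonal(-1).setZero();   // first subdiagonal, index chosen at runtime
  // m.diagonal(5);           // would now trip the eigen_assert above on a 4x4 matrix
}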

View File

@ -31,7 +31,8 @@ struct dot_nocheck
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod; typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
typedef typename conj_prod::result_type ResScalar; typedef typename conj_prod::result_type ResScalar;
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b) EIGEN_STRONG_INLINE
static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{ {
return a.template binaryExpr<conj_prod>(b).sum(); return a.template binaryExpr<conj_prod>(b).sum();
} }
@ -43,7 +44,8 @@ struct dot_nocheck<T, U, true>
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod; typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
typedef typename conj_prod::result_type ResScalar; typedef typename conj_prod::result_type ResScalar;
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b) EIGEN_STRONG_INLINE
static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{ {
return a.transpose().template binaryExpr<conj_prod>(b).sum(); return a.transpose().template binaryExpr<conj_prod>(b).sum();
} }
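Both branches reduce a conj_prod binary expression, so dot() is conjugate-linear in its first argument; a quick sketch of the user-visible behavior (unchanged by this commit):

#include <complex>
#include <Eigen/Dense>

int main() {
  typedef std::complex<double> C;
  Eigen::Vector2cd a(C(0, 1), C(1, 0));
  Eigen::Vector2cd b(C(0, 1), C(2, 0));
  C d = a.dot(b);  // conj(i)*i + conj(1)*2 = 1 + 2 = 3
  return d == C(3, 0) ? 0 : 1;
}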
@ -65,6 +67,7 @@ struct dot_nocheck<T, U, true>
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
{ {
@ -102,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
* \sa lpNorm(), dot(), squaredNorm() * \sa lpNorm(), dot(), squaredNorm()
*/ */
template<typename Derived> template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
{ {
return numext::sqrt(squaredNorm()); return numext::sqrt(squaredNorm());
} }
@ -117,7 +120,7 @@ inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real Matr
* \sa norm(), normalize() * \sa norm(), normalize()
*/ */
template<typename Derived> template<typename Derived>
inline const typename MatrixBase<Derived>::PlainObject EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::normalized() const MatrixBase<Derived>::normalized() const
{ {
typedef typename internal::nested_eval<Derived,2>::type _Nested; typedef typename internal::nested_eval<Derived,2>::type _Nested;
@ -139,7 +142,7 @@ MatrixBase<Derived>::normalized() const
* \sa norm(), normalized() * \sa norm(), normalized()
*/ */
template<typename Derived> template<typename Derived>
inline void MatrixBase<Derived>::normalize() EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
{ {
RealScalar z = squaredNorm(); RealScalar z = squaredNorm();
// NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
@ -160,7 +163,7 @@ inline void MatrixBase<Derived>::normalize()
* \sa stableNorm(), stableNormalize(), normalized() * \sa stableNorm(), stableNormalize(), normalized()
*/ */
template<typename Derived> template<typename Derived>
inline const typename MatrixBase<Derived>::PlainObject EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::stableNormalized() const MatrixBase<Derived>::stableNormalized() const
{ {
typedef typename internal::nested_eval<Derived,3>::type _Nested; typedef typename internal::nested_eval<Derived,3>::type _Nested;
@ -185,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
* \sa stableNorm(), stableNormalized(), normalize() * \sa stableNorm(), stableNormalized(), normalize()
*/ */
template<typename Derived> template<typename Derived>
inline void MatrixBase<Derived>::stableNormalize() EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
{ {
RealScalar w = cwiseAbs().maxCoeff(); RealScalar w = cwiseAbs().maxCoeff();
RealScalar z = (derived()/w).squaredNorm(); RealScalar z = (derived()/w).squaredNorm();
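A sketch of the overflow case this pre-scaling by the largest coefficient guards against (values illustrative):

#include <Eigen/Dense>

int main() {
  Eigen::Vector2d v(1e300, 1e300);
  Eigen::Vector2d a = v.normalized();        // squaredNorm() overflows to inf, result degenerates to zeros
  Eigen::Vector2d b = v.stableNormalized();  // rescaled by maxCoeff first: ~(0.707, 0.707)
  return (b.norm() > 0.5 && a.norm() == 0.0) ? 0 : 1;
}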

View File

@ -14,6 +14,7 @@
namespace Eigen { namespace Eigen {
/** \class EigenBase /** \class EigenBase
* \ingroup Core_Module
* *
* Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T). * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
* *
@ -128,6 +129,7 @@ template<typename Derived> struct EigenBase
*/ */
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other) Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
{ {
call_assignment(derived(), other.derived()); call_assignment(derived(), other.derived());
@ -136,6 +138,7 @@ Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other) Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
{ {
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>()); call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -144,6 +147,7 @@ Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other) Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
{ {
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>()); call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());

View File

@ -24,12 +24,17 @@ template<int Rows, int Cols, int Depth> struct product_type_selector;
template<int Size, int MaxSize> struct product_size_category template<int Size, int MaxSize> struct product_size_category
{ {
enum { is_large = MaxSize == Dynamic || enum {
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || #ifndef EIGEN_CUDA_ARCH
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), is_large = MaxSize == Dynamic ||
value = is_large ? Large Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
: Size == 1 ? 1 (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
: Small #else
is_large = 0,
#endif
value = is_large ? Large
: Size == 1 ? 1
: Small
}; };
}; };
@ -379,8 +384,6 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
* *
* \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*() * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
*/ */
#ifndef __CUDACC__
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
inline const Product<Derived, OtherDerived> inline const Product<Derived, OtherDerived>
@ -412,8 +415,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
return Product<Derived, OtherDerived>(derived(), other.derived()); return Product<Derived, OtherDerived>(derived(), other.derived());
} }
#endif // __CUDACC__
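With the CUDA-only variant gone, one declaration of operator* now serves host and device code; typical host usage (standard API):

#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(64, 64);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(64, 64);
  Eigen::MatrixXd C(64, 64);
  C.noalias() = A * B;                   // Product expression evaluated without a temporary
  Eigen::MatrixXd D = A.lazyProduct(B);  // coefficient-based path, bypasses the GEMM kernel
  return (C - D).norm() < 1e-9 ? 0 : 1;
}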
/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation. /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
* *
* The returned product will behave like any other expressions: the coefficients of the product will be * The returned product will behave like any other expressions: the coefficients of the product will be

View File

@ -230,7 +230,7 @@ pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(
* duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]} * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
* Currently, this function is only used for scalar * complex products. * Currently, this function is only used for scalar * complex products.
*/ */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; } ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
/** \internal \returns a packet with elements of \a *from quadrupled. /** \internal \returns a packet with elements of \a *from quadrupled.
@ -278,7 +278,7 @@ inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
} }
/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
template<typename Packet> inline Packet template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
plset(const typename unpacket_traits<Packet>::type& a) { return a; } plset(const typename unpacket_traits<Packet>::type& a) { return a; }
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
@ -351,10 +351,7 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet&
/** \internal \returns \a a with real and imaginary part flipped (for complex type only) */ /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
{ {
// FIXME: uncomment the following in case we drop the internal imag and real functions. return Packet(a.imag(),a.real());
// using std::imag;
// using std::real;
return Packet(imag(a),real(a));
} }
/************************** /**************************
@ -482,7 +479,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
* by the current computation. * by the current computation.
*/ */
template<typename Packet, int LoadMode> template<typename Packet, int LoadMode>
inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from) EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
{ {
return ploadt<Packet, LoadMode>(from); return ploadt<Packet, LoadMode>(from);
} }
@ -524,10 +521,10 @@ inline void palign(PacketType& first, const PacketType& second)
#ifndef __CUDACC__ #ifndef __CUDACC__
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b) template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } { return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b) template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
{ return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } { return std::complex<double>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
#endif #endif
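The rewritten specializations spell out the usual complex product; a standalone check of the formula (sketch):

#include <cassert>
#include <complex>

int main() {
  std::complex<float> a(1.f, 2.f), b(3.f, 4.f);
  std::complex<float> p(a.real()*b.real() - a.imag()*b.imag(),
                        a.imag()*b.real() + a.real()*b.imag());
  assert(p == a * b);  // both give (-5, 10)
}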

View File

@ -109,20 +109,6 @@ class WithFormat
IOFormat m_format; IOFormat m_format;
}; };
/** \returns a WithFormat proxy object allowing to print a matrix the with given
* format \a fmt.
*
* See class IOFormat for some examples.
*
* \sa class IOFormat, class WithFormat
*/
template<typename Derived>
inline const WithFormat<Derived>
DenseBase<Derived>::format(const IOFormat& fmt) const
{
return WithFormat<Derived>(derived(), fmt);
}
namespace internal { namespace internal {
// NOTE: This helper is kept for backward compatibility with previous code specializing // NOTE: This helper is kept for backward compatibility with previous code specializing

View File

@ -20,11 +20,17 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
{ {
typedef traits<PlainObjectType> TraitsBase; typedef traits<PlainObjectType> TraitsBase;
enum { enum {
PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
? PlainObjectType::ColsAtCompileTime
: PlainObjectType::RowsAtCompileTime,
InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
? int(PlainObjectType::InnerStrideAtCompileTime) ? int(PlainObjectType::InnerStrideAtCompileTime)
: int(StrideType::InnerStrideAtCompileTime), : int(StrideType::InnerStrideAtCompileTime),
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
? int(PlainObjectType::OuterStrideAtCompileTime) ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
? Dynamic
: int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
: int(StrideType::OuterStrideAtCompileTime), : int(StrideType::OuterStrideAtCompileTime),
Alignment = int(MapOptions)&int(AlignedMask), Alignment = int(MapOptions)&int(AlignedMask),
Flags0 = TraitsBase::Flags & (~NestByRefBit), Flags0 = TraitsBase::Flags & (~NestByRefBit),
@ -107,10 +113,11 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
inline Index outerStride() const inline Index outerStride() const
{ {
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() return int(StrideType::OuterStrideAtCompileTime) != 0 ? m_stride.outer()
: IsVectorAtCompileTime ? this->size() : int(internal::traits<Map>::OuterStrideAtCompileTime) != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
: int(Flags)&RowMajorBit ? this->cols() : IsVectorAtCompileTime ? (this->size() * innerStride())
: this->rows(); : (int(Flags)&RowMajorBit) ? (this->cols() * innerStride())
: (this->rows() * innerStride());
} }
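A usage sketch for the stride machinery this trait feeds (standard Map API):

#include <Eigen/Dense>

int main() {
  float data[8] = {0.f};
  // View a 3x2 column-major matrix whose columns start 4 floats apart.
  Eigen::Map<Eigen::MatrixXf, 0, Eigen::OuterStride<> > m(data, 3, 2, Eigen::OuterStride<>(4));
  m(2, 1) = 1.f;  // writes data[1*4 + 2]
  return data[6] == 1.f ? 0 : 1;
}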
/** Constructor in the fixed-size case. /** Constructor in the fixed-size case.

View File

@ -43,6 +43,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
enum { enum {
RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime, RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime, ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
InnerStrideAtCompileTime = internal::traits<Derived>::InnerStrideAtCompileTime,
SizeAtCompileTime = Base::SizeAtCompileTime SizeAtCompileTime = Base::SizeAtCompileTime
}; };
@ -181,14 +182,19 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
#endif #endif
protected: protected:
EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
template<typename T> template<typename T>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
{ {
#if EIGEN_MAX_ALIGN_BYTES>0 #if EIGEN_MAX_ALIGN_BYTES>0
// innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value:
const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime);
EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride);
eigen_assert(( ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0) eigen_assert(( ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
|| (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned"); || (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
#endif #endif
} }
@ -290,6 +296,9 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
// In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base, // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base,
// see bugs 821 and 920. // see bugs 821 and 920.
using ReadOnlyMapBase::Base::operator=; using ReadOnlyMapBase::Base::operator=;
protected:
EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
}; };
#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS

View File

@ -287,7 +287,7 @@ struct abs2_impl_default<Scalar, true> // IsComplex
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) static inline RealScalar run(const Scalar& x)
{ {
return real(x)*real(x) + imag(x)*imag(x); return x.real()*x.real() + x.imag()*x.imag();
} }
}; };
@ -313,14 +313,17 @@ struct abs2_retval
****************************************************************************/ ****************************************************************************/
template<typename Scalar, bool IsComplex> template<typename Scalar, bool IsComplex>
struct norm1_default_impl struct norm1_default_impl;
template<typename Scalar>
struct norm1_default_impl<Scalar,true>
{ {
typedef typename NumTraits<Scalar>::Real RealScalar; typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x) static inline RealScalar run(const Scalar& x)
{ {
EIGEN_USING_STD_MATH(abs); EIGEN_USING_STD_MATH(abs);
return abs(real(x)) + abs(imag(x)); return abs(x.real()) + abs(x.imag());
} }
}; };
@ -348,31 +351,7 @@ struct norm1_retval
* Implementation of hypot * * Implementation of hypot *
****************************************************************************/ ****************************************************************************/
template<typename Scalar> template<typename Scalar> struct hypot_impl;
struct hypot_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
static inline RealScalar run(const Scalar& x, const Scalar& y)
{
EIGEN_USING_STD_MATH(abs);
EIGEN_USING_STD_MATH(sqrt);
RealScalar _x = abs(x);
RealScalar _y = abs(y);
Scalar p, qp;
if(_x>_y)
{
p = _x;
qp = _y / p;
}
else
{
p = _y;
qp = _x / p;
}
if(p==RealScalar(0)) return RealScalar(0);
return p * sqrt(RealScalar(1) + qp*qp);
}
};
template<typename Scalar> template<typename Scalar>
struct hypot_retval struct hypot_retval
@ -495,7 +474,7 @@ namespace std_fallback {
typedef typename NumTraits<Scalar>::Real RealScalar; typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_USING_STD_MATH(log); EIGEN_USING_STD_MATH(log);
Scalar x1p = RealScalar(1) + x; Scalar x1p = RealScalar(1) + x;
return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
} }
} }
@ -640,21 +619,28 @@ template<typename Scalar>
struct random_default_impl<Scalar, false, true> struct random_default_impl<Scalar, false, true>
{ {
static inline Scalar run(const Scalar& x, const Scalar& y) static inline Scalar run(const Scalar& x, const Scalar& y)
{ {
typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX; if (y <= x)
if(y<x)
return x; return x;
// the following difference might overflow on a 32 bits system, // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
// but since y>=x the result converted to an unsigned long is still correct. typedef typename make_unsigned<Scalar>::type ScalarU;
std::size_t range = ScalarX(y)-ScalarX(x); // ScalarX is the widest of ScalarU and unsigned int.
std::size_t offset = 0; // We'll deal only with ScalarX and unsigned int below thus avoiding signed
// rejection sampling // types and arithmetic and signed overflows (which are undefined behavior).
std::size_t divisor = 1; typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX;
std::size_t multiplier = 1; // The following difference doesn't overflow, provided our integer types are two's
if(range<RAND_MAX) divisor = (std::size_t(RAND_MAX)+1)/(range+1); // complement and have the same number of padding bits in signed and unsigned variants.
else multiplier = 1 + range/(std::size_t(RAND_MAX)+1); // This is the case in most modern implementations of C++.
ScalarX range = ScalarX(y) - ScalarX(x);
ScalarX offset = 0;
ScalarX divisor = 1;
ScalarX multiplier = 1;
const unsigned rand_max = RAND_MAX;
if (range <= rand_max) divisor = (rand_max + 1) / (range + 1);
else multiplier = 1 + range / (rand_max + 1);
// Rejection sampling.
do { do {
offset = (std::size_t(std::rand()) * multiplier) / divisor; offset = (unsigned(std::rand()) * multiplier) / divisor;
} while (offset > range); } while (offset > range);
return Scalar(ScalarX(x) + offset); return Scalar(ScalarX(x) + offset);
} }
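Host-side usage of the rewritten sampler (internal API, so the signature may change between releases):

#include <Eigen/Dense>

int main() {
  int r = Eigen::internal::random<int>(-10, 10);  // uniform on [-10, 10], no signed overflow
  return (r >= -10 && r <= 10) ? 0 : 1;
}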
@ -679,8 +665,8 @@ struct random_default_impl<Scalar, true, false>
{ {
static inline Scalar run(const Scalar& x, const Scalar& y) static inline Scalar run(const Scalar& x, const Scalar& y)
{ {
return Scalar(random(real(x), real(y)), return Scalar(random(x.real(), y.real()),
random(imag(x), imag(y))); random(x.imag(), y.imag()));
} }
static inline Scalar run() static inline Scalar run()
{ {
@ -933,6 +919,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x); return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
} }
EIGEN_DEVICE_FUNC
inline bool abs2(bool x) { return x; }
template<typename Scalar> template<typename Scalar>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x) inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
@ -1030,7 +1019,8 @@ inline int log2(int x)
/** \returns the square root of \a x. /** \returns the square root of \a x.
* *
* It is essentially equivalent to \code using std::sqrt; return sqrt(x); \endcode, * It is essentially equivalent to
* \code using std::sqrt; return sqrt(x); \endcode
* but slightly faster for float/double and some compilers (e.g., gcc), thanks to * but slightly faster for float/double and some compilers (e.g., gcc), thanks to
* specializations when SSE is enabled. * specializations when SSE is enabled.
* *
@ -1061,11 +1051,24 @@ double log(const double &x) { return ::log(x); }
template<typename T> template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename NumTraits<T>::Real abs(const T &x) { typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
abs(const T &x) {
EIGEN_USING_STD_MATH(abs); EIGEN_USING_STD_MATH(abs);
return abs(x); return abs(x);
} }
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex),typename NumTraits<T>::Real>::type
abs(const T &x) {
return x;
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #ifdef __CUDACC__
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float abs(const float &x) { return ::fabsf(x); } float abs(const float &x) { return ::fabsf(x); }

View File

@ -71,6 +71,29 @@ T generic_fast_tanh_float(const T& a_x)
return pdiv(p, q); return pdiv(p, q);
} }
template<typename RealScalar>
EIGEN_STRONG_INLINE
RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
{
EIGEN_USING_STD_MATH(sqrt);
RealScalar p, qp;
p = numext::maxi(x,y);
if(p==RealScalar(0)) return RealScalar(0);
qp = numext::mini(y,x) / p;
return p * sqrt(RealScalar(1) + qp*qp);
}
template<typename Scalar>
struct hypot_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
static inline RealScalar run(const Scalar& x, const Scalar& y)
{
EIGEN_USING_STD_MATH(abs);
return positive_real_hypot<RealScalar>(abs(x), abs(y));
}
};
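The rescaling in positive_real_hypot avoids intermediate overflow; a standalone sketch of the same idea:

#include <algorithm>
#include <cassert>
#include <cmath>

double safe_hypot(double x, double y) {  // mirrors positive_real_hypot above
  double ax = std::abs(x), ay = std::abs(y);
  double p = std::max(ax, ay);
  if (p == 0.0) return 0.0;
  double qp = std::min(ax, ay) / p;      // qp <= 1, so qp*qp cannot overflow
  return p * std::sqrt(1.0 + qp * qp);
}

int main() {
  assert(std::isfinite(safe_hypot(3e200, 4e200)));  // naive sqrt(x*x + y*y) would give inf
}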
} // end namespace internal } // end namespace internal
} // end namespace Eigen } // end namespace Eigen

View File

@ -274,8 +274,6 @@ class Matrix
: Base(std::move(other)) : Base(std::move(other))
{ {
Base::_check_template_params(); Base::_check_template_params();
if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
Base::_set_noalias(other);
} }
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value) Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)

View File

@ -160,20 +160,11 @@ template<typename Derived> class MatrixBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator-=(const MatrixBase<OtherDerived>& other); Derived& operator-=(const MatrixBase<OtherDerived>& other);
#ifdef __CUDACC__
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
const Product<Derived,OtherDerived,LazyProduct>
operator*(const MatrixBase<OtherDerived> &other) const
{ return this->lazyProduct(other); }
#else
template<typename OtherDerived>
const Product<Derived,OtherDerived> const Product<Derived,OtherDerived>
operator*(const MatrixBase<OtherDerived> &other) const; operator*(const MatrixBase<OtherDerived> &other) const;
#endif
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
const Product<Derived,OtherDerived,LazyProduct> const Product<Derived,OtherDerived,LazyProduct>
@ -294,7 +285,7 @@ template<typename Derived> class MatrixBase
* fuzzy comparison such as isApprox() * fuzzy comparison such as isApprox()
* \sa isApprox(), operator!= */ * \sa isApprox(), operator!= */
template<typename OtherDerived> template<typename OtherDerived>
inline bool operator==(const MatrixBase<OtherDerived>& other) const EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const
{ return cwiseEqual(other).all(); } { return cwiseEqual(other).all(); }
/** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other. /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
@ -302,7 +293,7 @@ template<typename Derived> class MatrixBase
* fuzzy comparison such as isApprox() * fuzzy comparison such as isApprox()
* \sa isApprox(), operator== */ * \sa isApprox(), operator== */
template<typename OtherDerived> template<typename OtherDerived>
inline bool operator!=(const MatrixBase<OtherDerived>& other) const EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
{ return cwiseNotEqual(other).any(); } { return cwiseNotEqual(other).any(); }
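As the comments note, these are exact coefficient-wise comparisons; a sketch of why isApprox() is usually preferred for floating point:

#include <Eigen/Dense>

int main() {
  Eigen::Vector2d a(0.1 + 0.2, 1.0);
  Eigen::Vector2d b(0.3, 1.0);
  bool exact  = (a == b);       // false: 0.1 + 0.2 != 0.3 in double
  bool approx = a.isApprox(b);  // true within NumTraits<double>::dummy_precision()
  return (!exact && approx) ? 0 : 1;
}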
NoAlias<Derived,Eigen::MatrixBase > noalias(); NoAlias<Derived,Eigen::MatrixBase > noalias();
@ -453,19 +444,28 @@ template<typename Derived> class MatrixBase
///////// MatrixFunctions module ///////// ///////// MatrixFunctions module /////////
typedef typename internal::stem_function<Scalar>::type StemFunction; typedef typename internal::stem_function<Scalar>::type StemFunction;
const MatrixExponentialReturnValue<Derived> exp() const; #define EIGEN_MATRIX_FUNCTION(ReturnType, Name, Description) \
/** \returns an expression of the matrix Description of \c *this. \brief This function requires the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>. To compute the coefficient-wise Description use ArrayBase::##Name . */ \
const ReturnType<Derived> Name() const;
#define EIGEN_MATRIX_FUNCTION_1(ReturnType, Name, Description, Argument) \
/** \returns an expression of the matrix Description of \c *this. \brief This function requires the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>. To compute the coefficient-wise Description use ArrayBase::##Name . */ \
const ReturnType<Derived> Name(Argument) const;
EIGEN_MATRIX_FUNCTION(MatrixExponentialReturnValue, exp, exponential)
/** \brief Helper function for the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>.*/
const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const; const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
const MatrixFunctionReturnValue<Derived> cosh() const; EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine)
const MatrixFunctionReturnValue<Derived> sinh() const; EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine)
const MatrixFunctionReturnValue<Derived> cos() const; EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine)
const MatrixFunctionReturnValue<Derived> sin() const; EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine)
const MatrixSquareRootReturnValue<Derived> sqrt() const; EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root)
const MatrixLogarithmReturnValue<Derived> log() const; EIGEN_MATRIX_FUNCTION(MatrixLogarithmReturnValue, log, logarithm)
const MatrixPowerReturnValue<Derived> pow(const RealScalar& p) const; EIGEN_MATRIX_FUNCTION_1(MatrixPowerReturnValue, pow, power to \c p, const RealScalar& p)
const MatrixComplexPowerReturnValue<Derived> pow(const std::complex<RealScalar>& p) const; EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex<RealScalar>& p)
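These declarations are implemented by the unsupported MatrixFunctions module, whose header must be included by the user (standard behavior, not new here):

#include <unsupported/Eigen/MatrixFunctions>

int main() {
  Eigen::Matrix2d A;
  A << 0, 1,
      -1, 0;
  Eigen::Matrix2d R = A.exp();  // matrix exponential of a skew-symmetric matrix: a rotation
  return R.isUnitary() ? 0 : 1;
}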
protected: protected:
EIGEN_DEVICE_FUNC MatrixBase() : Base() {} EIGEN_DEFAULT_COPY_CONSTRUCTOR(MatrixBase)
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MatrixBase)
private: private:
EIGEN_DEVICE_FUNC explicit MatrixBase(int); EIGEN_DEVICE_FUNC explicit MatrixBase(int);

View File

@ -215,6 +215,8 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); } static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); } static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
static inline int digits10() { return NumTraits<Scalar>::digits10(); }
}; };
template<> struct NumTraits<std::string> template<> struct NumTraits<std::string>

View File

@ -87,17 +87,6 @@ class PermutationBase : public EigenBase<Derived>
return derived(); return derived();
} }
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** This is a special case of the templated operator=. Its purpose is to
* prevent a default operator= from hiding the templated operator=.
*/
Derived& operator=(const PermutationBase& other)
{
indices() = other.indices();
return derived();
}
#endif
/** \returns the number of rows */ /** \returns the number of rows */
inline Index rows() const { return Index(indices().size()); } inline Index rows() const { return Index(indices().size()); }
@ -333,12 +322,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
inline PermutationMatrix(const PermutationBase<OtherDerived>& other) inline PermutationMatrix(const PermutationBase<OtherDerived>& other)
: m_indices(other.indices()) {} : m_indices(other.indices()) {}
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** Standard copy constructor. Defined only to prevent a default copy constructor
* from hiding the other templated constructor */
inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {}
#endif
/** Generic constructor from expression of the indices. The indices /** Generic constructor from expression of the indices. The indices
* array has the meaning that the permutations sends each integer i to indices[i]. * array has the meaning that the permutations sends each integer i to indices[i].
* *
@ -373,17 +356,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
return Base::operator=(tr.derived()); return Base::operator=(tr.derived());
} }
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** This is a special case of the templated operator=. Its purpose is to
* prevent a default operator= from hiding the templated operator=.
*/
PermutationMatrix& operator=(const PermutationMatrix& other)
{
m_indices = other.m_indices;
return *this;
}
#endif
/** const version of indices(). */ /** const version of indices(). */
const IndicesType& indices() const { return m_indices; } const IndicesType& indices() const { return m_indices; }
/** \returns a reference to the stored array representing the permutation. */ /** \returns a reference to the stored array representing the permutation. */


@ -41,7 +41,7 @@ template<> struct check_rows_cols_for_overflow<Dynamic> {
{ {
// http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242 // http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
// we assume Index is signed // we assume Index is signed
Index max_index = (size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed Index max_index = (std::size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
bool error = (rows == 0 || cols == 0) ? false bool error = (rows == 0 || cols == 0) ? false
: (rows > max_index / cols); : (rows > max_index / cols);
if (error) if (error)
@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
* \a data pointers. * \a data pointers.
* *
* Here is an example using strides:
* \include Matrix_Map_stride.cpp
* Output: \verbinclude Matrix_Map_stride.out
*
* \see class Map * \see class Map
*/ */
//@{ //@{
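
The referenced Matrix_Map_stride.cpp is not part of this diff; a minimal stride sketch in the same spirit (illustrative values only):

#include <Eigen/Dense>
#include <iostream>

int main() {
  int data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
  // Column-major 2x2 view: coefficient (i,j) reads data[j*6 + i*2].
  Eigen::Map<Eigen::MatrixXi, 0, Eigen::Stride<6, 2>> m(data, 2, 2);
  std::cout << m << std::endl;  // prints: 0 6 / 2 8
}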
@ -733,8 +737,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0) EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
{ {
EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) && const bool t0_is_integer_alike = internal::is_valid_index_type<T0>::value;
bool(NumTraits<T1>::IsInteger), const bool t1_is_integer_alike = internal::is_valid_index_type<T1>::value;
EIGEN_STATIC_ASSERT(t0_is_integer_alike &&
t1_is_integer_alike,
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
resize(rows,cols); resize(rows,cols);
} }
@ -769,9 +775,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
&& ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0) && ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0)
{ {
// NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument. // NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
const bool is_integer = NumTraits<T>::IsInteger; const bool is_integer_alike = internal::is_valid_index_type<T>::value;
EIGEN_UNUSED_VARIABLE(is_integer); EIGEN_UNUSED_VARIABLE(is_integer_alike);
EIGEN_STATIC_ASSERT(is_integer, EIGEN_STATIC_ASSERT(is_integer_alike,
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
resize(size); resize(size);
} }
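
The relaxed checks above accept any integer-like index type while still rejecting floating-point sizes at compile time; a sketch of what does and does not build:

#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd a(3, 4);          // fine: int arguments
  Eigen::MatrixXd b(3L, 4L);        // fine: any valid index type
  // Eigen::MatrixXd c(3.0, 4.0);   // compile-time error:
  //   FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED
}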
@ -812,6 +818,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
this->_set_noalias(other); this->_set_noalias(other);
} }
// Initialize an arbitrary matrix from an object convertible to the Derived type.
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Derived& other){
this->_set_noalias(other);
}
// Initialize an arbitrary matrix from a generic Eigen expression // Initialize an arbitrary matrix from a generic Eigen expression
template<typename T, typename OtherDerived> template<typename T, typename OtherDerived>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
@ -834,7 +847,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
this->derived() = r; this->derived() = r;
} }
// For fixed -size arrays: // For fixed-size Array<Scalar,...>
template<typename T> template<typename T>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Scalar& val0, EIGEN_STRONG_INLINE void _init1(const Scalar& val0,
@ -846,6 +859,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
Base::setConstant(val0); Base::setConstant(val0);
} }
// For fixed-size Array<Index,...>
template<typename T> template<typename T>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Index& val0, EIGEN_STRONG_INLINE void _init1(const Index& val0,


@ -97,8 +97,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions"); && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
} }
EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; } EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; } EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
@ -127,7 +127,7 @@ public:
using Base::derived; using Base::derived;
typedef typename Base::Scalar Scalar; typedef typename Base::Scalar Scalar;
operator const Scalar() const EIGEN_STRONG_INLINE operator const Scalar() const
{ {
return internal::evaluator<ProductXpr>(derived()).coeff(0,0); return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
} }
@ -162,7 +162,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
public: public:
EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
{ {
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
@ -170,7 +170,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
return internal::evaluator<Derived>(derived()).coeff(row,col); return internal::evaluator<Derived>(derived()).coeff(row,col);
} }
EIGEN_DEVICE_FUNC Scalar coeff(Index i) const EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const
{ {
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );


@ -32,7 +32,7 @@ struct evaluator<Product<Lhs, Rhs, Options> >
typedef Product<Lhs, Rhs, Options> XprType; typedef Product<Lhs, Rhs, Options> XprType;
typedef product_evaluator<XprType> Base; typedef product_evaluator<XprType> Base;
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
}; };
// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B" // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
@ -55,7 +55,7 @@ struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
const Product<Lhs, Rhs, DefaultProduct> > XprType; const Product<Lhs, Rhs, DefaultProduct> > XprType;
typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base; typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs()) : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
{} {}
}; };
@ -68,7 +68,7 @@ struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType; typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base; typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>( : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()), Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
xpr.index() )) xpr.index() ))
@ -207,6 +207,12 @@ struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename
static const bool value = true; static const bool value = true;
}; };
template<typename OtherXpr, typename Lhs, typename Rhs>
struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_difference_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
static const bool value = true;
};
template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2> template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
struct assignment_from_xpr_op_product struct assignment_from_xpr_op_product
{ {
@ -240,19 +246,19 @@ template<typename Lhs, typename Rhs>
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct> struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
{ {
template<typename Dst> template<typename Dst>
static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum(); dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
} }
template<typename Dst> template<typename Dst>
static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum(); dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
} }
template<typename Dst> template<typename Dst>
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); } { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
}; };
@ -306,25 +312,25 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
}; };
template<typename Dst> template<typename Dst>
static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>()); internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
} }
template<typename Dst> template<typename Dst>
static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>()); internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
} }
template<typename Dst> template<typename Dst>
static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>()); internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
} }
template<typename Dst> template<typename Dst>
static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{ {
internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>()); internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
} }
@ -390,7 +396,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
// but easier on the compiler side // but easier on the compiler side
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>()); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
} }
template<typename Dst> template<typename Dst>
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
@ -404,6 +410,32 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
// dst.noalias() -= lhs.lazyProduct(rhs); // dst.noalias() -= lhs.lazyProduct(rhs);
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>()); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
} }
// Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
// dst {,+,-}= s * (A.lazyProduct(B))
// This is a huge benefit for heap-allocated matrix types as it saves one costly allocation.
// For them, this strategy is also faster than simply bypassing the heap allocation through
// stack allocation.
// For fixed-size matrices, this is less clear-cut: it is sometimes 2x faster, but sometimes 3x slower,
// and the behavior also depends a lot on the compiler... so let's be conservative and enable it for
// dynamic sizes only, that is, when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
{
call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
}
// Here, we always have LhsT==Lhs, but we need to make it a template type to make the above
// overload more specialized.
template<typename Dst, typename LhsT, typename Func>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
{
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
}
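
An illustrative call pattern for the eval_dynamic overloads above (names are assumptions, not Eigen API):

#include <Eigen/Dense>

void accumulate(Eigen::MatrixXd& dst, const Eigen::MatrixXd& A,
                const Eigen::MatrixXd& B, double s) {
  // When this product is evaluated through the lazy path described above,
  // it is rewritten as dst += s * (A.lazyProduct(B)), saving the temporary
  // that materializing (s * A) would otherwise require.
  dst.noalias() += (s * A) * B;
}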
// template<typename Dst> // template<typename Dst>
// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) // static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
@ -779,7 +811,11 @@ public:
_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
_LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0), Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
Alignment = evaluator<MatrixType>::Alignment Alignment = evaluator<MatrixType>::Alignment,
AsScalarProduct = (DiagonalType::SizeAtCompileTime==1)
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
}; };
diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@ -791,7 +827,10 @@ public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
{ {
return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); if(AsScalarProduct)
return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
else
return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
} }
protected: protected:
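
A sketch of the case the new AsScalarProduct flag covers: a size-1 diagonal factor acts as a plain scalar broadcast (illustrative values):

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::VectorXd d(1);
  d << 2.0;
  Eigen::RowVectorXd r(4);
  r << 1, 2, 3, 4;
  // 1x1 diagonal times 1x4 row vector: every coefficient is scaled by d(0).
  Eigen::RowVectorXd scaled = d.asDiagonal() * r;
  std::cout << scaled << std::endl;  // 2 4 6 8
}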


@ -407,7 +407,7 @@ protected:
*/ */
template<typename Derived> template<typename Derived>
template<typename Func> template<typename Func>
typename internal::traits<Derived>::Scalar EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::redux(const Func& func) const DenseBase<Derived>::redux(const Func& func) const
{ {
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");


@ -28,12 +28,13 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
template<typename Derived> struct match { template<typename Derived> struct match {
enum { enum {
IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime,
HasDirectAccess = internal::has_direct_access<Derived>::ret, HasDirectAccess = internal::has_direct_access<Derived>::ret,
StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic) InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
|| int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime) || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
|| (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1), || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
OuterStrideMatch = Derived::IsVectorAtCompileTime OuterStrideMatch = IsVectorAtCompileTime
|| int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime), || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
// NOTE, this indirection of evaluator<Derived>::Alignment is needed // NOTE, this indirection of evaluator<Derived>::Alignment is needed
// to workaround a very strange bug in MSVC related to the instantiation // to workaround a very strange bug in MSVC related to the instantiation
@ -95,6 +96,8 @@ protected:
template<typename Expression> template<typename Expression>
EIGEN_DEVICE_FUNC void construct(Expression& expr) EIGEN_DEVICE_FUNC void construct(Expression& expr)
{ {
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression);
if(PlainObjectType::RowsAtCompileTime==1) if(PlainObjectType::RowsAtCompileTime==1)
{ {
eigen_assert(expr.rows()==1 || expr.cols()==1); eigen_assert(expr.rows()==1 || expr.cols()==1);
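
With the added static assert, a Ref whose fixed sizes cannot possibly match the bound expression is rejected when the template is instantiated; a sketch:

#include <Eigen/Dense>

void fill3(Eigen::Ref<Eigen::Vector3f> v) { v.setOnes(); }

int main() {
  Eigen::Vector4f v4 = Eigen::Vector4f::Zero();
  fill3(v4.head<3>());  // fine: fixed sizes are provably compatible
  // fill3(v4);         // now a compile-time error (3 vs 4)
}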


@ -71,7 +71,9 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix) explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
{} {
EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
}
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
inline Index rows() const { return m_matrix.rows(); } inline Index rows() const { return m_matrix.rows(); }
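
A sketch of what the new static assert enforces: only Lower or Upper are meaningful template modes for a self-adjoint view:

#include <Eigen/Dense>

int main() {
  Eigen::Matrix3d m = Eigen::Matrix3d::Random();
  auto good = m.selfadjointView<Eigen::Lower>();       // fine
  // auto bad = m.selfadjointView<Eigen::UnitDiag>();  // now a compile-time error
  (void)good;
}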
@ -189,7 +191,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2); TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
} }
typedef SelfAdjointView<const MatrixConjugateReturnType,Mode> ConjugateReturnType; typedef SelfAdjointView<const MatrixConjugateReturnType,UpLo> ConjugateReturnType;
/** \sa MatrixBase::conjugate() const */ /** \sa MatrixBase::conjugate() const */
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
inline const ConjugateReturnType conjugate() const inline const ConjugateReturnType conjugate() const


@ -15,33 +15,29 @@ namespace Eigen {
// TODO generalize the scalar type of 'other' // TODO generalize the scalar type of 'other'
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
{ {
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>()); internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
return derived(); return derived();
} }
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
{ {
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>()); internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
return derived(); return derived();
} }
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
{ {
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>()); internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
return derived(); return derived();
} }
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
{ {
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>()); internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
return derived(); return derived();
} }


@ -34,12 +34,12 @@ template<typename Decomposition, typename RhsType,typename StorageKind> struct s
template<typename Decomposition, typename RhsType> template<typename Decomposition, typename RhsType>
struct solve_traits<Decomposition,RhsType,Dense> struct solve_traits<Decomposition,RhsType,Dense>
{ {
typedef Matrix<typename RhsType::Scalar, typedef typename make_proper_matrix_type<typename RhsType::Scalar,
Decomposition::ColsAtCompileTime, Decomposition::ColsAtCompileTime,
RhsType::ColsAtCompileTime, RhsType::ColsAtCompileTime,
RhsType::PlainObject::Options, RhsType::PlainObject::Options,
Decomposition::MaxColsAtCompileTime, Decomposition::MaxColsAtCompileTime,
RhsType::MaxColsAtCompileTime> PlainObject; RhsType::MaxColsAtCompileTime>::type PlainObject;
}; };
template<typename Decomposition, typename RhsType> template<typename Decomposition, typename RhsType>


@ -19,7 +19,7 @@ namespace internal {
template<typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder> template<typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder>
struct triangular_solve_vector; struct triangular_solve_vector;
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder> template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder, int OtherInnerStride>
struct triangular_solve_matrix; struct triangular_solve_matrix;
// small helper struct extracting some traits on the underlying solver operation // small helper struct extracting some traits on the underlying solver operation
@ -98,8 +98,8 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false); BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor, triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor> (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor, Rhs::InnerStrideAtCompileTime>
::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride(), blocking); ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.innerStride(), rhs.outerStride(), blocking);
} }
}; };
@ -169,6 +169,9 @@ void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<Ot
OtherDerived& other = _other.const_cast_derived(); OtherDerived& other = _other.const_cast_derived();
eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) ); eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower))); eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
// If solving for a 0x0 matrix, there is nothing to do; simply return.
if (derived().cols() == 0)
return;
enum { copy = (internal::traits<OtherDerived>::Flags & RowMajorBit) && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1}; enum { copy = (internal::traits<OtherDerived>::Flags & RowMajorBit) && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1};
typedef typename internal::conditional<copy, typedef typename internal::conditional<copy,
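
With the early return above, solving against an empty triangular system becomes a well-defined no-op; a sketch:

#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd L(0, 0);
  Eigen::MatrixXd b(0, 2);
  // Previously this could take the address of coeffRef(0,0) on an empty
  // matrix; now it simply returns.
  L.triangularView<Eigen::Lower>().solveInPlace(b);
}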


@ -165,12 +165,13 @@ MatrixBase<Derived>::stableNorm() const
typedef typename internal::nested_eval<Derived,2>::type DerivedCopy; typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean; typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
DerivedCopy copy(derived()); const DerivedCopy copy(derived());
enum { enum {
CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
|| (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
&& (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
}; };
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>, typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;


@ -146,6 +146,8 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
{ {
return derived().nestedExpression().coeffRef(index); return derived().nestedExpression().coeffRef(index);
} }
protected:
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TransposeImpl)
}; };
/** \returns an expression of the transpose of *this. /** \returns an expression of the transpose of *this.


@ -33,17 +33,6 @@ class TranspositionsBase
indices() = other.indices(); indices() = other.indices();
return derived(); return derived();
} }
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** This is a special case of the templated operator=. Its purpose is to
* prevent a default operator= from hiding the templated operator=.
*/
Derived& operator=(const TranspositionsBase& other)
{
indices() = other.indices();
return derived();
}
#endif
/** \returns the number of transpositions */ /** \returns the number of transpositions */
Index size() const { return indices().size(); } Index size() const { return indices().size(); }
@ -171,12 +160,6 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
inline Transpositions(const TranspositionsBase<OtherDerived>& other) inline Transpositions(const TranspositionsBase<OtherDerived>& other)
: m_indices(other.indices()) {} : m_indices(other.indices()) {}
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** Standard copy constructor. Defined only to prevent a default copy constructor
* from hiding the other templated constructor */
inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {}
#endif
/** Generic constructor from expression of the transposition indices. */ /** Generic constructor from expression of the transposition indices. */
template<typename Other> template<typename Other>
explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices) explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
@ -189,17 +172,6 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
return Base::operator=(other); return Base::operator=(other);
} }
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** This is a special case of the templated operator=. Its purpose is to
* prevent a default operator= from hiding the templated operator=.
*/
Transpositions& operator=(const Transpositions& other)
{
m_indices = other.m_indices;
return *this;
}
#endif
/** Constructs an uninitialized permutation matrix of given size. /** Constructs an uninitialized permutation matrix of given size.
*/ */
inline Transpositions(Index size) : m_indices(size) inline Transpositions(Index size) : m_indices(size)
@ -306,17 +278,6 @@ class TranspositionsWrapper
return Base::operator=(other); return Base::operator=(other);
} }
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** This is a special case of the templated operator=. Its purpose is to
* prevent a default operator= from hiding the templated operator=.
*/
TranspositionsWrapper& operator=(const TranspositionsWrapper& other)
{
m_indices = other.m_indices;
return *this;
}
#endif
/** const version of indices(). */ /** const version of indices(). */
const IndicesType& indices() const { return m_indices; } const IndicesType& indices() const { return m_indices; }
@ -384,7 +345,7 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
const Product<OtherDerived, Transpose, AliasFreeProduct> const Product<OtherDerived, Transpose, AliasFreeProduct>
operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt) operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
{ {
return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt.derived()); return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt);
} }
/** \returns the \a matrix with the inverse transpositions applied to the rows. /** \returns the \a matrix with the inverse transpositions applied to the rows.


@ -217,9 +217,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix) explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
{} {}
using Base::operator=; EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)
TriangularView& operator=(const TriangularView &other)
{ return Base::operator=(other); }
/** \copydoc EigenBase::rows() */ /** \copydoc EigenBase::rows() */
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
@ -544,6 +542,10 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
template<typename ProductType> template<typename ProductType>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta); EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta);
protected:
EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl)
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl)
}; };
/*************************************************************************** /***************************************************************************


@ -204,23 +204,7 @@ template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
} }
}; };
template<> struct conj_helper<Packet8f, Packet4cf, false,false> EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
{ return Packet4cf(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet4cf, Packet8f, false,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
{ return Packet4cf(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b) template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{ {
@ -400,23 +384,7 @@ template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
} }
}; };
template<> struct conj_helper<Packet4d, Packet2cd, false,false> EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
{ return Packet2cd(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet2cd, Packet4d, false,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
{ return Packet2cd(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b) template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{ {


@ -159,11 +159,12 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
#ifdef __FMA__ #ifdef __FMA__
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) #if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
// clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
// and gcc stupidly generates a vfmadd132ps instruction, // and even register spilling with clang>=6.0 (bug 1637).
// so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate // Gcc stupidly generates a vfmadd132ps instruction.
// the result of the product. // So let's enforce it to generate a vfmadd231ps instruction since the most common use
// case is to accumulate the result of the product.
Packet8f res = c; Packet8f res = c;
__asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
return res; return res;
@ -172,7 +173,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f&
#endif #endif
} }
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) #if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
// see above // see above
Packet4d res = c; Packet4d res = c;
__asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
@ -308,9 +309,9 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
} }
#ifndef EIGEN_VECTORIZE_AVX512 #ifndef EIGEN_VECTORIZE_AVX512
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif #endif
template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) { template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
@ -333,9 +334,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{ {
__m256d tmp = _mm256_shuffle_pd(a,a,5); __m256d tmp = _mm256_shuffle_pd(a,a,5);
return _mm256_permute2f128_pd(tmp, tmp, 1); return _mm256_permute2f128_pd(tmp, tmp, 1);
#if 0
// This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
// exhibit the same latency/throughput, but it is here for future reference/benchmarking...
__m256d swap_halves = _mm256_permute2f128_pd(a,a,1); __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
return _mm256_permute_pd(swap_halves,5); return _mm256_permute_pd(swap_halves,5);
#endif
} }
// pabs should be ok // pabs should be ok


@ -29,6 +29,7 @@ namespace internal {
#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \ #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X)) const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
// Natural logarithm // Natural logarithm
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
@ -47,6 +48,7 @@ plog<Packet16f>(const Packet16f& _x) {
// The smallest non denormalized float number. // The smallest non denormalized float number.
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000); _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000); _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
// Polynomial coefficients. // Polynomial coefficients.
@ -64,11 +66,9 @@ plog<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f); _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
// invalid_mask is set to true when x is NaN // invalid_mask is set to true when x is NaN
__mmask16 invalid_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ); __mmask16 invalid_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
__mmask16 iszero_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ); __mmask16 iszero_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ);
// Truncate input values to the minimum positive normal. // Truncate input values to the minimum positive normal.
x = pmax(x, p16f_min_norm_pos); x = pmax(x, p16f_min_norm_pos);
@ -88,9 +88,9 @@ plog<Packet16f>(const Packet16f& _x) {
// x = x + x - 1.0; // x = x + x - 1.0;
// } else { x = x - 1.0; } // } else { x = x - 1.0; }
__mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ); __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps()); Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
x = psub(x, p16f_1); x = psub(x, p16f_1);
e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps())); e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
x = padd(x, tmp); x = padd(x, tmp);
Packet16f x2 = pmul(x, x); Packet16f x2 = pmul(x, x);
@ -118,10 +118,18 @@ plog<Packet16f>(const Packet16f& _x) {
x = padd(x, y); x = padd(x, y);
x = padd(x, y2); x = padd(x, y2);
// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf, // Filter out invalid inputs, i.e.:
_mm512_mask_blend_ps(invalid_mask, p16f_nan, x)); // - negative arg will be NAN,
// - 0 will be -INF.
// - +INF will be +INF
return _mm512_mask_blend_ps(iszero_mask,
_mm512_mask_blend_ps(invalid_mask,
_mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
p16f_nan),
p16f_minus_inf);
} }
#endif #endif
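
A scalar sketch of the log(x) = C*e + log(m) range reduction that plog implements above; the vector code approximates log(m) with the polynomial instead of calling std::log:

#include <cmath>

float log_range_reduced(float x) {  // assumes x is positive and finite
  int e;
  float m = std::frexp(x, &e);            // x = m * 2^e with m in [0.5, 1)
  if (m < 0.707106781f) { m += m; --e; }  // move m into [sqrt(1/2), sqrt(2))
  return 0.693147181f * e + std::log(m);  // C*e + log(m), C = log(2)
}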
// Exponential function. Works by writing "x = m*log(2) + r" where // Exponential function. Works by writing "x = m*log(2) + r" where
@ -257,50 +265,39 @@ pexp<Packet8d>(const Packet8d& _x) {
template <> template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
psqrt<Packet16f>(const Packet16f& _x) { psqrt<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); __mmask16 denormal_mask = _mm512_kand(
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
_CMP_LT_OQ),
_mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
Packet16f neg_half = pmul(_x, p16f_minus_half); Packet16f x = _mm512_rsqrt14_ps(_x);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
_mm512_setzero_ps());
// Do a single step of Newton's iteration. // Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
// Multiply the original _x by it's reciprocal square root to extract the // Flush results for denormals to zero.
// square root. return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
return pmul(_x, x);
} }
template <> template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
psqrt<Packet8d>(const Packet8d& _x) { psqrt<Packet8d>(const Packet8d& _x) {
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); __mmask16 denormal_mask = _mm512_kand(
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
_CMP_LT_OQ),
_mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
Packet8d neg_half = pmul(_x, p8d_minus_half); Packet8d x = _mm512_rsqrt14_pd(_x);
// select only the inverse sqrt of positive normal inputs (denormals are // Do a single step of Newton's iteration.
// flushed to zero and cause infs as well). x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
__mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
_mm512_setzero_pd());
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Do a second step of Newton's iteration. // Do a second step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
// Multiply the original _x by it's reciprocal square root to extract the return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
// square root.
return pmul(_x, x);
} }
#else #else
template <> template <>
@ -333,20 +330,18 @@ prsqrt<Packet16f>(const Packet16f& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are // select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well). // flushed to zero and cause infs as well).
__mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ); __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
_mm512_rsqrt14_ps(_x));
// Fill in NaNs and Infs for the negative/zero entries. // Fill in NaNs and Infs for the negative/zero entries.
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ); __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
Packet16f infs_and_nans = _mm512_mask_blend_ps( Packet16f infs_and_nans = _mm512_mask_blend_ps(
neg_mask, p16f_nan, neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
_mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
// Do a single step of Newton's iteration. // Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
// Insert NaNs and Infs in all the right places. // Insert NaNs and Infs in all the right places.
return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x); return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
} }
template <> template <>
@ -363,14 +358,12 @@ prsqrt<Packet8d>(const Packet8d& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are // select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well). // flushed to zero and cause infs as well).
__mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ); __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
_mm512_rsqrt14_pd(_x));
// Fill in NaNs and Infs for the negative/zero entries. // Fill in NaNs and Infs for the negative/zero entries.
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ); __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
Packet8d infs_and_nans = _mm512_mask_blend_pd( Packet8d infs_and_nans = _mm512_mask_blend_pd(
neg_mask, p8d_nan, neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
_mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
// Do a first step of Newton's iteration. // Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
@ -379,9 +372,9 @@ prsqrt<Packet8d>(const Packet8d& _x) {
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Insert NaNs and Infs in all the right places. // Insert NaNs and Infs in all the right places.
return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x); return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
} }
#else #elif defined(EIGEN_VECTORIZE_AVX512ER)
template <> template <>
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) { EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
return _mm512_rsqrt28_ps(x); return _mm512_rsqrt28_ps(x);


@ -19,10 +19,10 @@ namespace internal {
#endif #endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif #endif
#ifdef __FMA__ #ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif #endif
@ -54,13 +54,14 @@ template<> struct packet_traits<float> : default_packet_traits
AlignedOnScalar = 1, AlignedOnScalar = 1,
size = 16, size = 16,
HasHalfPacket = 1, HasHalfPacket = 1,
#if EIGEN_GNUC_AT_LEAST(5, 3) HasBlend = 0,
#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
HasLog = 1, HasLog = 1,
#endif #endif
HasExp = 1, HasExp = 1,
HasSqrt = 1, HasSqrt = EIGEN_FAST_MATH,
HasRsqrt = 1, HasRsqrt = EIGEN_FAST_MATH,
#endif #endif
HasDiv = 1 HasDiv = 1
}; };
@ -74,8 +75,8 @@ template<> struct packet_traits<double> : default_packet_traits
AlignedOnScalar = 1, AlignedOnScalar = 1,
size = 8, size = 8,
HasHalfPacket = 1, HasHalfPacket = 1,
#if EIGEN_GNUC_AT_LEAST(5, 3) #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
HasSqrt = 1, HasSqrt = EIGEN_FAST_MATH,
HasRsqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH,
#endif #endif
HasDiv = 1 HasDiv = 1
@ -98,6 +99,7 @@ template <>
struct unpacket_traits<Packet16f> { struct unpacket_traits<Packet16f> {
typedef float type; typedef float type;
typedef Packet8f half; typedef Packet8f half;
typedef Packet16i integer_packet;
enum { size = 16, alignment=Aligned64 }; enum { size = 16, alignment=Aligned64 };
}; };
template <> template <>
@ -132,7 +134,7 @@ EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
} }
template <> template <>
EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) { EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
return _mm512_broadcastsd_pd(_mm_load_pd1(from)); return _mm512_set1_pd(*from);
} }
template <> template <>
@ -158,6 +160,11 @@ EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
const Packet8d& b) { const Packet8d& b) {
return _mm512_add_pd(a, b); return _mm512_add_pd(a, b);
} }
template <>
EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
const Packet16i& b) {
return _mm512_add_epi32(a, b);
}
template <> template <>
EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
@ -169,6 +176,11 @@ EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
const Packet8d& b) { const Packet8d& b) {
return _mm512_sub_pd(a, b); return _mm512_sub_pd(a, b);
} }
template <>
EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
const Packet16i& b) {
return _mm512_sub_epi32(a, b);
}
template <> template <>
EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
@ -202,6 +214,11 @@ EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
const Packet8d& b) { const Packet8d& b) {
return _mm512_mul_pd(a, b); return _mm512_mul_pd(a, b);
} }
template <>
EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
const Packet16i& b) {
return _mm512_mul_epi32(a, b);
}
template <> template <>
EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a, EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
@ -214,7 +231,7 @@ EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
return _mm512_div_pd(a, b); return _mm512_div_pd(a, b);
} }
#ifdef __FMA__ #ifdef EIGEN_VECTORIZE_FMA
template <> template <>
EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
const Packet16f& c) { const Packet16f& c) {
@ -230,23 +247,73 @@ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
template <> template <>
EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a, EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
const Packet16f& b) { const Packet16f& b) {
return _mm512_min_ps(a, b); // Arguments are reversed to match NaN propagation behavior of std::min.
return _mm512_min_ps(b, a);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a, EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
const Packet8d& b) { const Packet8d& b) {
return _mm512_min_pd(a, b); // Arguments are reversed to match NaN propagation behavior of std::min.
return _mm512_min_pd(b, a);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a, EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
const Packet16f& b) { const Packet16f& b) {
return _mm512_max_ps(a, b); // Arguments are reversed to match NaN propagation behavior of std::max.
return _mm512_max_ps(b, a);
} }
template <> template <>
EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a, EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
const Packet8d& b) { const Packet8d& b) {
return _mm512_max_pd(a, b); // Arguments are reversed to match NaN propagation behavior of std::max.
return _mm512_max_pd(b, a);
}
#ifdef EIGEN_VECTORIZE_AVX512DQ
template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
#else
// AVX512F does not define _mm512_extractf32x8_ps to extract a __m256 from a __m512
template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
}
// AVX512F does not define _mm512_extractf64x2_pd to extract a __m128 from a __m512
template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
}
EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
_mm256_castps_si256(b),1));
}
#endif
// Helper function for the bit-packing step of the low-precision comparison.
// It packs the flags from sixteen 32-bit lanes into sixteen 16-bit lanes.
EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
// Split the data into small pieces and handle them with AVX instructions
// to guarantee the internal order of the vector.
// Operation:
// dst[15:0] := Saturate16(rf[31:0])
// dst[31:16] := Saturate16(rf[63:32])
// ...
// dst[255:240] := Saturate16(rf[255:224])
__m256i lo = _mm256_castps_si256(extract256<0>(rf));
__m256i hi = _mm256_castps_si256(extract256<1>(rf));
__m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
_mm256_extractf128_si256(lo, 1));
__m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
_mm256_extractf128_si256(hi, 1));
return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
}
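A hypothetical use of the helper (my sketch, assuming an AVX-512 build with the definitions above in scope): 32-bit all-ones flags saturate to 16-bit all-ones and lane order is preserved.

__m512i flags32 = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0,
                                   -1, 0, -1, 0, -1, 0, -1, 0);   // per-lane flags
__m256i flags16 = Pack32To16(_mm512_castsi512_ps(flags32));
// each 0xFFFFFFFF lane packs to 0xFFFF, each zero lane to 0x0000, in order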
template <>
EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
const Packet16i& b) {
return _mm512_and_si512(a,b);
} }
template <> template <>
@ -255,24 +322,7 @@ EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_and_ps(a, b); return _mm512_and_ps(a, b);
#else #else
Packet16f res = _mm512_undefined_ps(); return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0);
Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1);
Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2);
Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3);
return res;
#endif #endif
} }
template <> template <>
@ -288,35 +338,21 @@ EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1); return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
return res;
#endif #endif
} }
template <> template <>
EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i& b) {
const Packet16f& b) { return _mm512_or_si512(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_or_ps(a, b); return _mm512_or_ps(a, b);
#else #else
Packet16f res = _mm512_undefined_ps(); return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0);
Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1);
Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2);
Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3);
return res;
#endif #endif
} }
@ -326,109 +362,67 @@ EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_or_pd(a, b); return _mm512_or_pd(a, b);
#else #else
Packet8d res = _mm512_undefined_pd(); return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0);
Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1);
return res;
#endif #endif
} }
template <> template <>
EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16i& b) {
const Packet16f& b) { return _mm512_xor_si512(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_xor_ps(a, b); return _mm512_xor_ps(a, b);
#else #else
Packet16f res = _mm512_undefined_ps(); return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0);
Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1);
Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2);
Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3);
return res;
#endif #endif
} }
template <> template <>
EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b) {
const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_xor_pd(a, b); return _mm512_xor_pd(a, b);
#else #else
Packet8d res = _mm512_undefined_pd(); return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0);
Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1);
return res;
#endif #endif
} }
template <> template <>
EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packet16i& b) {
const Packet16f& b) { return _mm512_andnot_si512(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_andnot_ps(a, b); return _mm512_andnot_ps(b, a);
#else #else
Packet16f res = _mm512_undefined_ps(); return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0);
Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1);
Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2);
Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3);
return res;
#endif #endif
} }
template <> template <>
EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a, EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) {
const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_andnot_pd(a, b); return _mm512_andnot_pd(b, a);
#else #else
Packet8d res = _mm512_undefined_pd(); return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0);
Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1);
return res;
#endif #endif
} }
template<int N> EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
return _mm512_srai_epi32(a, N);
}
template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
return _mm512_srli_epi32(a, N);
}
template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
return _mm512_slli_epi32(a, N);
}
template <> template <>
EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) { EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from); EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
@ -461,75 +455,55 @@ EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
// {a0, a0, a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7} // {a0, a0, a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
template <> template <>
EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) { EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
Packet8f lane0 = _mm256_broadcast_ps((const __m128*)(const void*)from); // an unaligned load is required here as there is no requirement
// mimic an "inplace" permutation of the lower 128bits using a blend // on the alignment of input pointer 'from'
lane0 = _mm256_blend_ps( __m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
lane0, _mm256_castps128_ps256(_mm_permute_ps( __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
_mm256_castps256_ps128(lane0), _MM_SHUFFLE(1, 0, 1, 0))), __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
15); return pairs;
// then we can perform a consistent permutation on the global register to get }
// everything in shape:
lane0 = _mm256_permute_ps(lane0, _MM_SHUFFLE(3, 3, 2, 2));
Packet8f lane1 = _mm256_broadcast_ps((const __m128*)(const void*)(from + 4));
// mimic an "inplace" permutation of the lower 128bits using a blend
lane1 = _mm256_blend_ps(
lane1, _mm256_castps128_ps256(_mm_permute_ps(
_mm256_castps256_ps128(lane1), _MM_SHUFFLE(1, 0, 1, 0))),
15);
// then we can perform a consistent permutation on the global register to get
// everything in shape:
lane1 = _mm256_permute_ps(lane1, _MM_SHUFFLE(3, 3, 2, 2));
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
Packet16f res = _mm512_undefined_ps(); // FIXME: this does not look optimal, better load a Packet4d and shuffle...
return _mm512_insertf32x8(res, lane0, 0);
return _mm512_insertf32x8(res, lane1, 1);
return res;
#else
Packet16f res = _mm512_undefined_ps();
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 0), 0);
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 1), 1);
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 0), 2);
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 1), 3);
return res;
#endif
}
// Loads 4 doubles from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, // Loads 4 doubles from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3,
// a3} // a3}
template <> template <>
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) { EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
Packet4d lane0 = _mm256_broadcast_pd((const __m128d*)(const void*)from); __m512d x = _mm512_setzero_pd();
lane0 = _mm256_permute_pd(lane0, 3 << 2); x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
Packet4d lane1 = _mm256_broadcast_pd((const __m128d*)(const void*)(from + 2)); x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
lane1 = _mm256_permute_pd(lane1, 3 << 2); x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
return x;
Packet8d res = _mm512_undefined_pd();
res = _mm512_insertf64x4(res, lane0, 0);
return _mm512_insertf64x4(res, lane1, 1);
} }
#else
template <>
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
__m512d x = _mm512_setzero_pd();
x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
return x;
}
#endif
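A minimal check of the duplication pattern (my sketch, assuming an AVX-512 build and the pstoreu overload defined in this file):

double in[4] = {1.0, 2.0, 3.0, 4.0};
double out[8];
pstoreu(out, ploaddup<Packet8d>(in));
// out == {1,1, 2,2, 3,3, 4,4}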
// Loads 4 floats from memory and returns the packet // Loads 4 floats from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3} // {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
template <> template <>
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) { EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
Packet16f tmp = _mm512_undefined_ps(); Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0); const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1); return _mm512_permutexvar_ps(scatter_mask, tmp);
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
return tmp;
} }
// Loads 2 doubles from memory and returns the packet // Loads 2 doubles from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1} // {a0, a0, a0, a0, a1, a1, a1, a1}
template <> template <>
EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) { EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
Packet8d tmp = _mm512_undefined_pd(); __m256d lane0 = _mm256_set1_pd(*from);
Packet2d tmp0 = _mm_load_pd1(from); __m256d lane1 = _mm256_set1_pd(*(from+1));
Packet2d tmp1 = _mm_load_pd1(from + 1); __m512d tmp = _mm512_undefined_pd();
Packet4d lane0 = _mm256_broadcastsd_pd(tmp0);
Packet4d lane1 = _mm256_broadcastsd_pd(tmp1);
tmp = _mm512_insertf64x4(tmp, lane0, 0); tmp = _mm512_insertf64x4(tmp, lane0, 0);
return _mm512_insertf64x4(tmp, lane1, 1); return _mm512_insertf64x4(tmp, lane1, 1);
} }
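And the corresponding check for the quad-duplication variant (same assumptions, illustrative only):

double in[2] = {1.0, 2.0};
double out[8];
pstoreu(out, ploadquad<Packet8d>(in));
// out == {1,1,1,1, 2,2,2,2}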
@ -565,7 +539,7 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
template <> template <>
EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from, EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
Index stride) { Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(stride); Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier = Packet16i stride_multiplier =
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@ -575,7 +549,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
template <> template <>
EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from, EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
Index stride) { Index stride) {
Packet8i stride_vector = _mm256_set1_epi32(stride); Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
@ -586,7 +560,7 @@ template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
const Packet16f& from, const Packet16f& from,
Index stride) { Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(stride); Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier = Packet16i stride_multiplier =
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@ -596,7 +570,7 @@ template <>
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
const Packet8d& from, const Packet8d& from,
Index stride) { Index stride) {
Packet8i stride_vector = _mm256_set1_epi32(stride); Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
_mm512_i32scatter_pd(to, indices, from, 8); _mm512_i32scatter_pd(to, indices, from, 8);
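All four kernels share one indexing scheme: lane i maps to element i * stride, and the scale argument (4 for float, 8 for double) turns the 32-bit lane index into a byte offset. A hedged usage sketch:

float buf[48];
for (int i = 0; i < 48; ++i) buf[i] = float(i);
Packet16f g = pgather<float, Packet16f>(buf, 3);  // reads buf[0], buf[3], ..., buf[45]
pscatter<float, Packet16f>(buf, g, 3);            // writes the lanes back to the same slots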
@ -618,9 +592,9 @@ EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
pstore(to, pa); pstore(to, pa);
} }
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template <> template <>
EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) { EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
@ -648,20 +622,20 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
{ {
// _mm512_abs_ps intrinsic not found, so hack around it // _mm512_abs_ps intrinsic not found, so hack around it
return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff)); return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
} }
template <> template <>
EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
// _mm512_abs_pd intrinsic not found, so hack around it // _mm512_abs_pd intrinsic not found, so hack around it
return (__m512d)_mm512_and_si512((__m512i)a, return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
_mm512_set1_epi64(0x7fffffffffffffff)); _mm512_set1_epi64(0x7fffffffffffffff)));
} }
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
// AVX512F does not define _mm512_extractf32x8_ps to extract a __m256 from a __m512 // AVX512F does not define _mm512_extractf32x8_ps to extract a __m256 from a __m512
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
__m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0) __m256 OUTPUT##_1 = \ __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
_mm512_extractf32x8_ps(INPUT, 1) __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
#else #else
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
__m256 OUTPUT##_0 = _mm256_insertf128_ps( \ __m256 OUTPUT##_0 = _mm256_insertf128_ps( \
@ -674,17 +648,136 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \ OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
#else #else
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_undefined_ps(); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3); OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
#endif #endif
template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f* vecs) template <>
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
__m256 lane0 = _mm512_extractf32x8_ps(a, 0);
__m256 lane1 = _mm512_extractf32x8_ps(a, 1);
Packet8f x = _mm256_add_ps(lane0, lane1);
return predux<Packet8f>(x);
#else
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
sum = _mm_hadd_ps(sum, sum);
sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
return _mm_cvtss_f32(sum);
#endif
}
template <>
EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d sum = _mm256_add_pd(lane0, lane1);
__m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0)));
}
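A worked check of the horizontal sum (illustrative, assuming ploadu<Packet16f> from this file):

float data[16];
for (int i = 0; i < 16; ++i) data[i] = float(i + 1);
float s = predux<Packet16f>(ploadu<Packet16f>(data));  // 1 + 2 + ... + 16 == 136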
template <>
EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
return padd(lane0, lane1);
#else
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
Packet4f sum0 = padd(lane0, lane2);
Packet4f sum1 = padd(lane1, lane3);
return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
Packet4d res = padd(lane0, lane1);
return res;
}
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
//#ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
Packet8f res = pmul(lane0, lane1);
res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#else
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#endif
}
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = pmul(lane0, lane1);
res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
}
template <>
EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = _mm256_min_pd(lane0, lane1);
res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
template <>
EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = _mm256_max_pd(lane0, lane1);
res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f* vecs)
{ {
EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0); EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0);
EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1); EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1);
@ -873,174 +966,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
return _mm512_insertf64x4(final_output, final_1, 1); return _mm512_insertf64x4(final_output, final_1, 1);
} }
template <>
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
//#ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
Packet8f sum = padd(lane0, lane1);
Packet8f tmp0 = _mm256_hadd_ps(sum, _mm256_permute2f128_ps(a, a, 1));
tmp0 = _mm256_hadd_ps(tmp0, tmp0);
return pfirst(_mm256_hadd_ps(tmp0, tmp0));
#else
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
Packet4f sum = padd(padd(lane0, lane1), padd(lane2, lane3));
sum = _mm_hadd_ps(sum, sum);
sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
return pfirst(sum);
#endif
}
template <>
EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
Packet4d sum = padd(lane0, lane1);
Packet4d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
return pfirst(_mm256_hadd_pd(tmp0, tmp0));
}
template <>
EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
return padd(lane0, lane1);
#else
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
Packet4f sum0 = padd(lane0, lane2);
Packet4f sum1 = padd(lane1, lane3);
return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
Packet4d res = padd(lane0, lane1);
return res;
}
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
//#ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
Packet8f res = pmul(lane0, lane1);
res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#else
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#endif
}
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
Packet4d res = pmul(lane0, lane1);
res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
}
template <>
EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
Packet4f res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
Packet4d res = _mm256_min_pd(lane0, lane1);
res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
template <>
EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
Packet4d res = _mm256_max_pd(lane0, lane1);
res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
template <int Offset>
struct palign_impl<Offset, Packet16f> {
static EIGEN_STRONG_INLINE void run(Packet16f& first,
const Packet16f& second) {
if (Offset != 0) {
__m512i first_idx = _mm512_set_epi32(
Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
__m512i second_idx =
_mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
Offset - 5, Offset - 6, Offset - 7, Offset - 8,
Offset - 9, Offset - 10, Offset - 11, Offset - 12,
Offset - 13, Offset - 14, Offset - 15, Offset - 16);
unsigned short mask = 0xFFFF;
mask <<= (16 - Offset);
first = _mm512_permutexvar_ps(first_idx, first);
Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
first = _mm512_mask_blend_ps(mask, first, tmp);
}
}
};
template <int Offset>
struct palign_impl<Offset, Packet8d> {
static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
if (Offset != 0) {
__m512i first_idx = _mm512_set_epi32(
0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
__m512i second_idx = _mm512_set_epi32(
0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
unsigned char mask = 0xFF;
mask <<= (8 - Offset);
first = _mm512_permutexvar_pd(first_idx, first);
Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
first = _mm512_mask_blend_pd(mask, first, tmp);
}
}
};
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \ #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
@ -1302,13 +1228,76 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
return Packet16f(); return Packet16f();
} }
template <> template <>
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/, EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
const Packet8d& /*thenPacket*/, const Packet8d& thenPacket,
const Packet8d& /*elsePacket*/) { const Packet8d& elsePacket) {
assert(false && "To be implemented"); __mmask8 m = (ifPacket.select[0] )
return Packet8d(); | (ifPacket.select[1]<<1)
| (ifPacket.select[2]<<2)
| (ifPacket.select[3]<<3)
| (ifPacket.select[4]<<4)
| (ifPacket.select[5]<<5)
| (ifPacket.select[6]<<6)
| (ifPacket.select[7]<<7);
return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
} }
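The selector bits are packed LSB-first, so select[i] drives lane i. An illustrative use (assumes Selector and pset1 from Eigen's internals):

Selector<8> sel;
for (int i = 0; i < 8; ++i) sel.select[i] = ((i & 1) != 0);
Packet8d r = pblend(sel, pset1<Packet8d>(1.0), pset1<Packet8d>(0.0));
// odd lanes take thenPacket: r == {0,1,0,1,0,1,0,1}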
template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
return _mm512_cvttps_epi32(a);
}
template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
return _mm512_cvtepi32_ps(a);
}
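Note that the float-to-int direction truncates toward zero (_mm512_cvttps_epi32) rather than rounding; a one-line sketch:

Packet16i ip = pcast<Packet16f, Packet16i>(pset1<Packet16f>(-1.7f));  // every lane is -1, not -2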
template <int Offset>
struct palign_impl<Offset, Packet16f> {
static EIGEN_STRONG_INLINE void run(Packet16f& first,
const Packet16f& second) {
if (Offset != 0) {
__m512i first_idx = _mm512_set_epi32(
Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
__m512i second_idx =
_mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
Offset - 5, Offset - 6, Offset - 7, Offset - 8,
Offset - 9, Offset - 10, Offset - 11, Offset - 12,
Offset - 13, Offset - 14, Offset - 15, Offset - 16);
unsigned short mask = 0xFFFF;
mask <<= (16 - Offset);
first = _mm512_permutexvar_ps(first_idx, first);
Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
first = _mm512_mask_blend_ps(mask, first, tmp);
}
}
};
template <int Offset>
struct palign_impl<Offset, Packet8d> {
static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
if (Offset != 0) {
__m512i first_idx = _mm512_set_epi32(
0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
__m512i second_idx = _mm512_set_epi32(
0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
unsigned char mask = 0xFF;
mask <<= (8 - Offset);
first = _mm512_permutexvar_pd(first_idx, first);
Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
first = _mm512_mask_blend_pd(mask, first, tmp);
}
}
};
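In effect, palign_impl<Offset> concatenates the two registers and keeps the lanes starting at Offset. A hedged example for the float variant:

float a[16], b[16];
for (int i = 0; i < 16; ++i) { a[i] = float(i); b[i] = float(100 + i); }
Packet16f f = ploadu<Packet16f>(a);
palign_impl<4, Packet16f>::run(f, ploadu<Packet16f>(b));
// f == {4, 5, ..., 15, 100, 101, 102, 103}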
} // end namespace internal } // end namespace internal
} // end namespace Eigen } // end namespace Eigen
View File
@ -65,7 +65,7 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{ {
Packet2cf res; Packet2cf res;
if((ptrdiff_t(&from) % 16) == 0) if((std::ptrdiff_t(&from) % 16) == 0)
res.v = pload<Packet4f>((const float *)&from); res.v = pload<Packet4f>((const float *)&from);
else else
res.v = ploadu<Packet4f>((const float *)&from); res.v = ploadu<Packet4f>((const float *)&from);
@ -224,23 +224,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
} }
}; };
template<> struct conj_helper<Packet4f, Packet2cf, false,false> EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
{ return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
};
template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
{ return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{ {
@ -416,23 +400,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
return pconj(internal::pmul(a, b)); return pconj(internal::pmul(a, b));
} }
}; };
template<> struct conj_helper<Packet2d, Packet1cd, false,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
{ return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
};
template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
{ return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{ {
View File
@ -90,7 +90,7 @@ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
#define _EIGEN_MASK_ALIGNMENT 0xfffffff0 #define _EIGEN_MASK_ALIGNMENT 0xfffffff0
#endif #endif
#define _EIGEN_ALIGNED_PTR(x) ((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) #define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
// Handle endianness properly while loading constants // Handle endianness properly while loading constants
// Define global static constants: // Define global static constants:
@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4u
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#else #else
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
@ -388,10 +388,28 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifdef __VSX__
Packet4f ret;
__asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
#else
return vec_min(a, b);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifdef __VSX__
Packet4f ret;
__asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
#else
return vec_max(a, b);
#endif
}
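A scalar model of the compare+select idiom in both asm blocks above (my reading of the VSX operands, stated as an assumption): the compare yields an all-ones lane mask where it succeeds, xxsel then picks the second source, so a NaN fails the compare and the first source wins, mirroring std::min/std::max.

static inline float vsx_pmin_model(float a, float b) { return (a >= b) ? b : a; }
static inline float vsx_pmax_model(float a, float b) { return (b > a) ? b : a; }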
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
@ -450,15 +468,15 @@ template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{ {
Packet4f p; Packet4f p;
if((ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from); if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from);
else p = ploadu<Packet4f>(from); else p = ploadu<Packet4f>(from);
return vec_perm(p, p, p16uc_DUPLICATE32_HI); return vec_perm(p, p, p16uc_DUPLICATE32_HI);
} }
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{ {
Packet4i p; Packet4i p;
if((ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from); if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from);
else p = ploadu<Packet4i>(from); else p = ploadu<Packet4i>(from);
return vec_perm(p, p, p16uc_DUPLICATE32_HI); return vec_perm(p, p, p16uc_DUPLICATE32_HI);
} }
@ -764,7 +782,7 @@ typedef __vector __bool long Packet2bl;
static Packet2l p2l_ONE = { 1, 1 }; static Packet2l p2l_ONE = { 1, 1 };
static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO); static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ONE = { 1.0, 1.0 };
static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO); static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
static Packet2d p2d_MZERO = { -0.0, -0.0 }; static Packet2d p2d_MZERO = { -0.0, -0.0 };
@ -910,9 +928,19 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const
// for some weird reasons, it has to be overloaded for packets of integers // for some weird reasons, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
{
Packet2d ret;
__asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
}
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
{
Packet2d ret;
__asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
}
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
@ -935,8 +963,8 @@ template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{ {
Packet2d p; Packet2d p;
if((ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from); if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from);
else p = ploadu<Packet2d>(from); else p = ploadu<Packet2d>(from);
return vec_splat_dbl<0>(p); return vec_splat_dbl<0>(p);
} }
@ -969,7 +997,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
Packet2d v[2], sum; Packet2d v[2], sum;
v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8)); v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8)); v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
#ifdef _BIG_ENDIAN #ifdef _BIG_ENDIAN
sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8)); sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
#else #else
@ -1022,7 +1050,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)); Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
return vec_sel(elsePacket, thenPacket, mask); return vec_sel(elsePacket, thenPacket, mask);
} }
#endif // __VSX__ #endif // __VSX__
View File
@ -13,7 +13,7 @@
// Redistribution and use in source and binary forms, with or without // Redistribution and use in source and binary forms, with or without
// modification, are permitted. // modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
@ -29,7 +29,7 @@
// type Eigen::half (inheriting from CUDA's __half struct) with // type Eigen::half (inheriting from CUDA's __half struct) with
// operator overloads such that it behaves basically as an arithmetic // operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay // type. It will be quite slow on CPUs (so it is recommended to stay
// in fp32 for CPUs, except for simple parameter conversions, I/O // in float32_bits for CPUs, except for simple parameter conversions, I/O
// to disk and the likes), but fast on GPUs. // to disk and the likes), but fast on GPUs.
@ -42,6 +42,7 @@
#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type() #define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
#endif #endif
#include <sstream>
namespace Eigen { namespace Eigen {
@ -50,38 +51,45 @@ struct half;
namespace half_impl { namespace half_impl {
#if !defined(EIGEN_HAS_CUDA_FP16) #if !defined(EIGEN_HAS_CUDA_FP16)
// Make our own __half_raw definition that is similar to CUDA's.
// Make our own __half definition that is similar to CUDA's. struct __half_raw {
struct __half { EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
EIGEN_DEVICE_FUNC __half() {} explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
unsigned short x; unsigned short x;
}; };
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
typedef __half __half_raw;
#endif #endif
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h); EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
struct half_base : public __half { struct half_base : public __half_raw {
EIGEN_DEVICE_FUNC half_base() {} EIGEN_DEVICE_FUNC half_base() {}
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {} EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {} EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
#endif
}; };
} // namespace half_impl } // namespace half_impl
// Class definition. // Class definition.
struct half : public half_impl::half_base { struct half : public half_impl::half_base {
#if !defined(EIGEN_HAS_CUDA_FP16) #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
typedef half_impl::__half __half; typedef half_impl::__half_raw __half_raw;
#endif #endif
EIGEN_DEVICE_FUNC half() {} EIGEN_DEVICE_FUNC half() {}
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
#endif
explicit EIGEN_DEVICE_FUNC half(bool b) explicit EIGEN_DEVICE_FUNC half(bool b)
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
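raw_uint16_to_half builds a half straight from its IEEE-754 binary16 bit pattern, which is why true maps to 0x3c00 above. Two illustrative values (not from the diff):

Eigen::half one      = Eigen::half_impl::raw_uint16_to_half(0x3c00);  // +1.0
Eigen::half neg_zero = Eigen::half_impl::raw_uint16_to_half(0x8000);  // -0.0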
@ -138,71 +146,125 @@ struct half : public half_impl::half_base {
} }
}; };
} // end namespace Eigen
namespace std {
template<>
struct numeric_limits<Eigen::half> {
static const bool is_specialized = true;
static const bool is_signed = true;
static const bool is_integer = false;
static const bool is_exact = false;
static const bool has_infinity = true;
static const bool has_quiet_NaN = true;
static const bool has_signaling_NaN = true;
static const float_denorm_style has_denorm = denorm_present;
static const bool has_denorm_loss = false;
static const std::float_round_style round_style = std::round_to_nearest;
static const bool is_iec559 = false;
static const bool is_bounded = false;
static const bool is_modulo = false;
static const int digits = 11;
static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static const int radix = 2;
static const int min_exponent = -13;
static const int min_exponent10 = -4;
static const int max_exponent = 16;
static const int max_exponent10 = 4;
static const bool traps = true;
static const bool tinyness_before = false;
static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
static Eigen::half round_error() { return Eigen::half(0.5); }
static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
};
// If std::numeric_limits<T> is specialized, should also specialize
// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
// std::numeric_limits<const volatile T>
// https://stackoverflow.com/a/16519653/
template<>
struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
template<>
struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
template<>
struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
} // end namespace std
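With the specialization in place, generic code can query half's range like any other scalar. The extreme bit patterns above decode as follows (illustrative):

Eigen::half hi = (std::numeric_limits<Eigen::half>::max)();   // 0x7bff == +65504
Eigen::half lo = std::numeric_limits<Eigen::half>::lowest();  // 0xfbff == -65504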
namespace Eigen {
namespace half_impl { namespace half_impl {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
// Intrinsics for native fp16 support. Note that on current hardware, // Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than fp32 arithmetic (you need to use the half2 // these are no faster than fp32 arithmetic (you need to use the half2
// versions to get the ALU speed increased), but you do save the // versions to get the ALU speed increased), but you do save the
// conversion steps back and forth. // conversion steps back and forth.
__device__ half operator + (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
return __hadd(a, b); return __hadd(a, b);
} }
__device__ half operator * (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
return __hmul(a, b); return __hmul(a, b);
} }
__device__ half operator - (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
return __hsub(a, b); return __hsub(a, b);
} }
__device__ half operator / (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
float num = __half2float(a); float num = __half2float(a);
float denom = __half2float(b); float denom = __half2float(b);
return __float2half(num / denom); return __float2half(num / denom);
} }
__device__ half operator - (const half& a) { EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
return __hneg(a); return __hneg(a);
} }
__device__ half& operator += (half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
a = a + b; a = a + b;
return a; return a;
} }
__device__ half& operator *= (half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
a = a * b; a = a * b;
return a; return a;
} }
__device__ half& operator -= (half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
a = a - b; a = a - b;
return a; return a;
} }
__device__ half& operator /= (half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
a = a / b; a = a / b;
return a; return a;
} }
__device__ bool operator == (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
return __heq(a, b); return __heq(a, b);
} }
__device__ bool operator != (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
return __hne(a, b); return __hne(a, b);
} }
__device__ bool operator < (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
return __hlt(a, b); return __hlt(a, b);
} }
__device__ bool operator <= (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
return __hle(a, b); return __hle(a, b);
} }
__device__ bool operator > (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
return __hgt(a, b); return __hgt(a, b);
} }
__device__ bool operator >= (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
return __hge(a, b); return __hge(a, b);
} }
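// Sketch of the half2 path the comment above refers to (assumed, not part of
// this patch): packing two halves lets one instruction cover both lanes,
// which is where the fp16 ALU speedup actually comes from:
//   __device__ half2 hadd2_demo(half2 a, half2 b) { return __hadd2(a, b); } // hypothetical name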
#else // Emulate support for half floats #else // Emulate support for half floats
// Definitions for CPUs and older CUDA, mostly working through conversion // Definitions for CPUs and older CUDA, mostly working through conversion
// to/from fp32. // to/from fp32.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
return half(float(a) + float(b)); return half(float(a) + float(b));
@ -238,10 +300,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b)
return a; return a;
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
return float(a) == float(b); return numext::equal_strict(float(a),float(b));
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
return float(a) != float(b); return numext::not_equal_strict(float(a), float(b));
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
return float(a) < float(b); return float(a) < float(b);
@ -269,34 +331,35 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
// these in hardware. If we need more performance on older/other CPUs, they are // these in hardware. If we need more performance on older/other CPUs, they are
// also possible to vectorize directly. // also possible to vectorize directly.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
__half h; __half_raw h;
h.x = x; h.x = x;
return h; return h;
} }
union FP32 { union float32_bits {
unsigned int u; unsigned int u;
float f; float f;
}; };
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
return __float2half(ff); __half tmp_ff = __float2half(ff);
return *(__half_raw*)&tmp_ff;
#elif defined(EIGEN_HAS_FP16_C) #elif defined(EIGEN_HAS_FP16_C)
__half h; __half_raw h;
h.x = _cvtss_sh(ff, 0); h.x = _cvtss_sh(ff, 0);
return h; return h;
#else #else
FP32 f; f.f = ff; float32_bits f; f.f = ff;
const FP32 f32infty = { 255 << 23 }; const float32_bits f32infty = { 255 << 23 };
const FP32 f16max = { (127 + 16) << 23 }; const float32_bits f16max = { (127 + 16) << 23 };
const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
unsigned int sign_mask = 0x80000000u; unsigned int sign_mask = 0x80000000u;
__half o; __half_raw o;
o.x = static_cast<unsigned short>(0x0u); o.x = static_cast<unsigned short>(0x0u);
unsigned int sign = f.u & sign_mask; unsigned int sign = f.u & sign_mask;
@ -335,17 +398,17 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
#endif #endif
} }
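// Round-trip sketch (illustrative values, scalar path assumed):
//   __half_raw h = float_to_half_rtne(1.5f); // h.x == 0x3e00
//   float f = half_to_float(h);              // == 1.5f exactly; 1.5 is representable in fp16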
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
return __half2float(h); return __half2float(h);
#elif defined(EIGEN_HAS_FP16_C) #elif defined(EIGEN_HAS_FP16_C)
return _cvtsh_ss(h.x); return _cvtsh_ss(h.x);
#else #else
const FP32 magic = { 113 << 23 }; const float32_bits magic = { 113 << 23 };
const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
FP32 o; float32_bits o;
o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
unsigned int exp = shifted_exp & o.u; // just the exponent unsigned int exp = shifted_exp & o.u; // just the exponent
@ -370,7 +433,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
return (a.x & 0x7fff) == 0x7c00; return (a.x & 0x7fff) == 0x7c00;
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return __hisnan(a); return __hisnan(a);
#else #else
return (a.x & 0x7fff) > 0x7c00; return (a.x & 0x7fff) > 0x7c00;
@ -386,11 +449,15 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
return result; return result;
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
return half(::expf(float(a))); #if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
return half(hexp(a));
#else
return half(::expf(float(a)));
#endif
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return Eigen::half(::hlog(a)); return half(::hlog(a));
#else #else
return half(::logf(float(a))); return half(::logf(float(a)));
#endif #endif
@ -402,7 +469,11 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
return half(::log10f(float(a))); return half(::log10f(float(a)));
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
return half(::sqrtf(float(a))); #if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
return half(hsqrt(a));
#else
return half(::sqrtf(float(a)));
#endif
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
return half(::powf(float(a), float(b))); return half(::powf(float(a), float(b)));
@ -420,14 +491,22 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
return half(::tanhf(float(a))); return half(::tanhf(float(a)));
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
return half(hfloor(a));
#else
return half(::floorf(float(a))); return half(::floorf(float(a)));
#endif
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
return half(hceil(a));
#else
return half(::ceilf(float(a))); return half(::ceilf(float(a)));
#endif
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return __hlt(b, a) ? b : a; return __hlt(b, a) ? b : a;
#else #else
const float f1 = static_cast<float>(a); const float f1 = static_cast<float>(a);
@ -436,7 +515,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#endif #endif
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return __hlt(a, b) ? b : a; return __hlt(a, b) ? b : a;
#else #else
const float f1 = static_cast<float>(a); const float f1 = static_cast<float>(a);
@ -477,6 +556,13 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
template<> struct NumTraits<Eigen::half> template<> struct NumTraits<Eigen::half>
: GenericNumTraits<Eigen::half> : GenericNumTraits<Eigen::half>
{ {
enum {
IsSigned = true,
IsInteger = false,
IsComplex = false,
RequireInitialization = false
};
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
return half_impl::raw_uint16_to_half(0x0800); return half_impl::raw_uint16_to_half(0x0800);
} }
@ -507,7 +593,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
return Eigen::half(::expf(float(a))); return Eigen::half(::expf(float(a)));
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return Eigen::half(::hlog(a)); return Eigen::half(::hlog(a));
#else #else
return Eigen::half(::logf(float(a))); return Eigen::half(::logf(float(a)));
@ -541,14 +627,18 @@ struct hash<Eigen::half> {
// Add the missing shfl_xor intrinsic // Add the missing shfl_xor intrinsic
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
#if EIGEN_CUDACC_VER < 90000
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width)); return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
#else
return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
#endif
} }
#endif #endif
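// Usage sketch (assumed, not part of the patch): the overload enables warp-wide
// butterfly reductions directly on Eigen::half, e.g. summing across all lanes:
//   __device__ Eigen::half warp_sum(Eigen::half v) {       // hypothetical helper
//     for (int m = warpSize/2; m > 0; m >>= 1) v = v + __shfl_xor(v, m);
//     return v;
//   }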
// ldg() has an overload for __half, but we also need one for Eigen::half. // ldg() has an overload for __half, but we also need one for Eigen::half.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
return Eigen::half_impl::raw_uint16_to_half( return Eigen::half_impl::raw_uint16_to_half(
__ldg(reinterpret_cast<const unsigned short*>(ptr))); __ldg(reinterpret_cast<const unsigned short*>(ptr)));
@ -556,7 +646,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
#endif #endif
#if defined(__CUDA_ARCH__) #if defined(EIGEN_CUDA_ARCH)
namespace Eigen { namespace Eigen {
namespace numext { namespace numext {

View File

@ -291,7 +291,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
EIGEN_DEVICE_FUNC inline void EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<float4,4>& kernel) { ptranspose(PacketBlock<float4,4>& kernel) {
double tmp = kernel.packet[0].y; float tmp = kernel.packet[0].y;
kernel.packet[0].y = kernel.packet[1].x; kernel.packet[0].y = kernel.packet[1].x;
kernel.packet[1].x = tmp; kernel.packet[1].x = tmp;

View File

@ -99,7 +99,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2&
template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) { template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
half2 result; half2 result;
result.x = a.x & 0x7FFF7FFF; unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
*(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
return result; return result;
} }
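// Restated (assumption about intent): older CUDA exposed half2 as one packed
// 32-bit word in .x, while newer CUDA makes .x/.y opaque __half members, so
// both lanes' sign bits are now cleared through a reinterpret_cast of the
// whole packet rather than a bitwise AND on a member.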
@ -229,7 +230,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2&
#else #else
float a1 = __low2float(a); float a1 = __low2float(a);
float a2 = __high2float(a); float a2 = __high2float(a);
return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2))); return Eigen::half(__float2half_rn(a1 + a2));
#endif #endif
} }
@ -263,7 +264,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const ha
#else #else
float a1 = __low2float(a); float a1 = __low2float(a);
float a2 = __high2float(a); float a2 = __high2float(a);
return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2))); return Eigen::half(__float2half_rn(a1 * a2));
#endif #endif
} }
@ -275,7 +276,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
return __floats2half2_rn(r1, r2); return __floats2half2_rn(r1, r2);
} }
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 #if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
template<> __device__ EIGEN_STRONG_INLINE template<> __device__ EIGEN_STRONG_INLINE
half2 plog<half2>(const half2& a) { half2 plog<half2>(const half2& a) {

View File

@ -0,0 +1,29 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_ARCH_CONJ_HELPER_H
#define EIGEN_ARCH_CONJ_HELPER_H
#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \
template<> struct conj_helper<PACKET_REAL, PACKET_CPLX, false,false> { \
EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \
{ return padd(c, pmul(x,y)); } \
EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \
{ return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); } \
}; \
\
template<> struct conj_helper<PACKET_CPLX, PACKET_REAL, false,false> { \
EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \
{ return padd(c, pmul(x,y)); } \
EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \
{ return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); } \
};
#endif // EIGEN_ARCH_CONJ_HELPER_H
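// Usage sketch (mirrors the invocations later in this patch): each backend
// now writes, e.g.,
//   EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
// and the macro expands to the two mixed real*complex conj_helper
// specializations that were previously duplicated by hand per architecture.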

View File

@ -67,7 +67,7 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{ {
float32x2_t r64; float32x2_t r64;
r64 = vld1_f32((float *)&from); r64 = vld1_f32((const float *)&from);
return Packet2cf(vcombine_f32(r64, r64)); return Packet2cf(vcombine_f32(r64, r64));
} }
@ -142,7 +142,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
} }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); } template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((const float *)addr); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{ {
@ -265,6 +265,8 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
} }
}; };
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{ {
// TODO optimize it for NEON // TODO optimize it for NEON
@ -275,7 +277,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
s = vmulq_f32(b.v, b.v); s = vmulq_f32(b.v, b.v);
rev_s = vrev64q_f32(s); rev_s = vrev64q_f32(s);
return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s,rev_s)));
} }
EIGEN_DEVICE_FUNC inline void EIGEN_DEVICE_FUNC inline void
@ -381,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((double *)addr); } template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((const double *)addr); }
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride) template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
{ {
@ -456,6 +458,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
} }
}; };
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{ {
// TODO optimize it for NEON // TODO optimize it for NEON

View File

@ -36,29 +36,63 @@ namespace internal {
#endif #endif
#endif #endif
#if EIGEN_COMP_MSVC
// In MSVC's arm_neon.h header file, all NEON vector types
// are aliases to the same underlying type __n128.
// We thus have to wrap them to make them different C++ types.
// (See also bug 1428)
template<typename T,int unique_id>
struct eigen_packet_wrapper
{
operator T&() { return m_val; }
operator const T&() const { return m_val; }
eigen_packet_wrapper() {}
eigen_packet_wrapper(const T &v) : m_val(v) {}
eigen_packet_wrapper& operator=(const T &v) {
m_val = v;
return *this;
}
T m_val;
};
typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
typedef eigen_packet_wrapper<int32x4_t ,2> Packet4i;
typedef eigen_packet_wrapper<int32x2_t ,3> Packet2i;
typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;
#else
typedef float32x2_t Packet2f; typedef float32x2_t Packet2f;
typedef float32x4_t Packet4f; typedef float32x4_t Packet4f;
typedef int32x4_t Packet4i; typedef int32x4_t Packet4i;
typedef int32x2_t Packet2i; typedef int32x2_t Packet2i;
typedef uint32x4_t Packet4ui; typedef uint32x4_t Packet4ui;
#endif // EIGEN_COMP_MSVC
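// Why distinct wrapper types matter (illustrative): under MSVC the raw NEON
// typedefs all alias __n128, so without eigen_packet_wrapper this pair would
// declare the same function twice instead of two overloads:
//   void f(float32x4_t);
//   void f(int32x4_t);   // same parameter type on MSVC -> not an overload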
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f p4f_##NAME = pset1<Packet4f>(X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X)) const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i p4i_##NAME = pset1<Packet4i>(X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function #if EIGEN_ARCH_ARM64
// which available on LLVM and GCC (at least) // __builtin_prefetch tends to do nothing on ARM64 compilers because the
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC // prefetch instructions there are too detailed for __builtin_prefetch to map
// meaningfully to them.
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR); #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#elif defined __pld #elif defined __pld
#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR) #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
#elif !EIGEN_ARCH_ARM64 #elif EIGEN_ARCH_ARM32
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
#else #else
// by default no explicit prefetching // by default no explicit prefetching
#define EIGEN_ARM_PREFETCH(ADDR) #define EIGEN_ARM_PREFETCH(ADDR)
@ -83,7 +117,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasSqrt = 0 HasSqrt = 0
}; };
}; };
template<> struct packet_traits<int> : default_packet_traits template<> struct packet_traits<int32_t> : default_packet_traits
{ {
typedef Packet4i type; typedef Packet4i type;
typedef Packet4i half; // Packet2i intrinsics not implemented yet typedef Packet4i half; // Packet2i intrinsics not implemented yet
@ -105,19 +139,19 @@ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q
EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif #endif
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); } template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
{ {
const float32_t f[] = {0, 1, 2, 3}; const float f[] = {0, 1, 2, 3};
Packet4f countdown = vld1q_f32(f); Packet4f countdown = vld1q_f32(f);
return vaddq_f32(pset1<Packet4f>(a), countdown); return vaddq_f32(pset1<Packet4f>(a), countdown);
} }
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
{ {
const int32_t i[] = {0, 1, 2, 3}; const int32_t i[] = {0, 1, 2, 3};
Packet4i countdown = vld1q_s32(i); Packet4i countdown = vld1q_s32(i);
@ -240,20 +274,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con
} }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{ {
float32x2_t lo, hi; float32x2_t lo, hi;
lo = vld1_dup_f32(from); lo = vld1_dup_f32(from);
hi = vld1_dup_f32(from+1); hi = vld1_dup_f32(from+1);
return vcombine_f32(lo, hi); return vcombine_f32(lo, hi);
} }
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
{ {
int32x2_t lo, hi; int32x2_t lo, hi;
lo = vld1_dup_s32(from); lo = vld1_dup_s32(from);
@ -261,11 +295,11 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
return vcombine_s32(lo, hi); return vcombine_s32(lo, hi);
} }
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } template<> EIGEN_STRONG_INLINE void pstore<float> (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu<float> (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{ {
@ -276,7 +310,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
res = vsetq_lane_f32(from[3*stride], res, 3); res = vsetq_lane_f32(from[3*stride], res, 3);
return res; return res;
} }
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
{ {
Packet4i res = pset1<Packet4i>(0); Packet4i res = pset1<Packet4i>(0);
res = vsetq_lane_s32(from[0*stride], res, 0); res = vsetq_lane_s32(from[0*stride], res, 0);
@ -293,7 +327,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co
to[stride*2] = vgetq_lane_f32(from, 2); to[stride*2] = vgetq_lane_f32(from, 2);
to[stride*3] = vgetq_lane_f32(from, 3); to[stride*3] = vgetq_lane_f32(from, 3);
} }
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
{ {
to[stride*0] = vgetq_lane_s32(from, 0); to[stride*0] = vgetq_lane_s32(from, 0);
to[stride*1] = vgetq_lane_s32(from, 1); to[stride*1] = vgetq_lane_s32(from, 1);
@ -301,12 +335,12 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
to[stride*3] = vgetq_lane_s32(from, 3); to[stride*3] = vgetq_lane_s32(from, 3);
} }
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch<float> (const float* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ARM_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
// FIXME only store the 2 first elements ? // FIXME only store the 2 first elements ?
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
float32x2_t a_lo, a_hi; float32x2_t a_lo, a_hi;
@ -361,7 +395,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
return sum; return sum;
} }
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
{ {
int32x2_t a_lo, a_hi, sum; int32x2_t a_lo, a_hi, sum;
@ -408,7 +442,7 @@ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
return vget_lane_f32(prod, 0); return vget_lane_f32(prod, 0);
} }
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
{ {
int32x2_t a_lo, a_hi, prod; int32x2_t a_lo, a_hi, prod;
@ -436,7 +470,7 @@ template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
return vget_lane_f32(min, 0); return vget_lane_f32(min, 0);
} }
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
{ {
int32x2_t a_lo, a_hi, min; int32x2_t a_lo, a_hi, min;
@ -461,7 +495,7 @@ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
return vget_lane_f32(max, 0); return vget_lane_f32(max, 0);
} }
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
{ {
int32x2_t a_lo, a_hi, max; int32x2_t a_lo, a_hi, max;

View File

@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
} }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{ {
@ -229,23 +229,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
} }
}; };
template<> struct conj_helper<Packet4f, Packet2cf, false,false> EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
{ return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
};
template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
{ return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{ {
@ -340,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{ {
@ -430,23 +414,7 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
} }
}; };
template<> struct conj_helper<Packet2d, Packet1cd, false,false> EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
{ return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
};
template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
{ return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{ {

View File

@ -28,7 +28,7 @@ namespace internal {
#endif #endif
#endif #endif
#if (defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004) #if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot // With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
// have overloads for both types without linking error. // have overloads for both types without linking error.
// One solution is to increase ABI version using -fabi-version=4 (or greater). // One solution is to increase ABI version using -fabi-version=4 (or greater).
@ -409,10 +409,16 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double&
pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
} }
#if EIGEN_COMP_PGI
typedef const void * SsePrefetchPtrType;
#else
typedef const char * SsePrefetchPtrType;
#endif
#ifndef EIGEN_VECTORIZE_AVX #ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif #endif
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
@ -876,4 +882,14 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co
} // end namespace Eigen } // end namespace Eigen
#if EIGEN_COMP_PGI
// PGI++ does not define the following intrinsics in C++ mode.
static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); }
static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); }
static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
#endif
#endif // EIGEN_PACKET_MATH_SSE_H #endif // EIGEN_PACKET_MATH_SSE_H

View File

@ -14,6 +14,7 @@ namespace Eigen {
namespace internal { namespace internal {
#ifndef EIGEN_VECTORIZE_AVX
template <> template <>
struct type_casting_traits<float, int> { struct type_casting_traits<float, int> {
enum { enum {
@ -23,11 +24,6 @@ struct type_casting_traits<float, int> {
}; };
}; };
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
return _mm_cvttps_epi32(a);
}
template <> template <>
struct type_casting_traits<int, float> { struct type_casting_traits<int, float> {
enum { enum {
@ -37,11 +33,6 @@ struct type_casting_traits<int, float> {
}; };
}; };
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return _mm_cvtepi32_ps(a);
}
template <> template <>
struct type_casting_traits<double, float> { struct type_casting_traits<double, float> {
enum { enum {
@ -51,10 +42,6 @@ struct type_casting_traits<double, float> {
}; };
}; };
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
}
template <> template <>
struct type_casting_traits<float, double> { struct type_casting_traits<float, double> {
enum { enum {
@ -63,6 +50,19 @@ struct type_casting_traits<float, double> {
TgtCoeffRatio = 2 TgtCoeffRatio = 2
}; };
}; };
#endif
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
return _mm_cvttps_epi32(a);
}
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return _mm_cvtepi32_ps(a);
}
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
}
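// Illustrative: these pcast definitions now live outside the #ifndef
// EIGEN_VECTORIZE_AVX guard, presumably so the SSE casts stay available when
// the AVX headers supply their own type_casting_traits. E.g. float->int truncates:
//   Packet4i pi = pcast<Packet4f, Packet4i>(pset1<Packet4f>(1.7f)); // every lane == 1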
template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) { template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
// Simply discard the second half of the input // Simply discard the second half of the input

View File

@ -336,6 +336,9 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
} }
}; };
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{ {
// TODO optimize it for AltiVec // TODO optimize it for AltiVec

View File

@ -100,7 +100,7 @@ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
// Mask alignment // Mask alignment
#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 #define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
#define _EIGEN_ALIGNED_PTR(x) ((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) #define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
// Handle endianness properly while loading constants // Handle endianness properly while loading constants
// Define global static constants: // Define global static constants:

View File

@ -28,7 +28,7 @@ template<typename DstScalar,typename SrcScalar> struct assign_op {
{ internal::pstoret<DstScalar,Packet,Alignment>(a,b); } { internal::pstoret<DstScalar,Packet,Alignment>(a,b); }
}; };
// Empty overload for void type (used by PermutationMatrix // Empty overload for void type (used by PermutationMatrix)
template<typename DstScalar> struct assign_op<DstScalar,void> {}; template<typename DstScalar> struct assign_op<DstScalar,void> {};
template<typename DstScalar,typename SrcScalar> template<typename DstScalar,typename SrcScalar>

View File

@ -255,7 +255,7 @@ struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,Rh
/** \internal /** \internal
* \brief Template functor to compute the hypot of two scalars * \brief Template functor to compute the hypot of two \b positive \b and \b real scalars
* *
* \sa MatrixBase::stableNorm(), class Redux * \sa MatrixBase::stableNorm(), class Redux
*/ */
@ -263,22 +263,15 @@ template<typename Scalar>
struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar> struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
{ {
EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op) EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
// typedef typename NumTraits<Scalar>::Real result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const
{ {
EIGEN_USING_STD_MATH(sqrt) // This functor is used only by hypotNorm, for which it is faster to first apply abs
Scalar p, qp; // on all coefficients prior to reduction through hypot.
if(_x>_y) // This way we avoid calling abs on positive and real entries, and this also permits
{ // to seamlessly handle complexes. Otherwise we would have to handle both real and complexes
p = _x; // through the same functor...
qp = _y / p; return internal::positive_real_hypot(x,y);
}
else
{
p = _y;
qp = _x / p;
}
return p * sqrt(Scalar(1) + qp*qp);
} }
}; };
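// For reference, the scaling idea the removed code implemented inline (and
// which positive_real_hypot presumably retains): with p = max(|x|,|y|) and
// qp = min(|x|,|y|)/p,
//   hypot(x, y) == p * sqrt(1 + qp*qp)
// so x*x + y*y is never formed and intermediate overflow/underflow is avoided.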
template<typename Scalar> template<typename Scalar>

View File

@ -44,16 +44,16 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
{ {
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)), m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
m_interPacket(plset<Packet>(0)),
m_flip(numext::abs(high)<numext::abs(low)) m_flip(numext::abs(high)<numext::abs(low))
{} {}
template<typename IndexType> template<typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
typedef typename NumTraits<Scalar>::Real RealScalar;
if(m_flip) if(m_flip)
return (i==0)? m_low : (m_high - (m_size1-i)*m_step); return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step);
else else
return (i==m_size1)? m_high : (m_low + i*m_step); return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step);
} }
template<typename IndexType> template<typename IndexType>
@ -63,7 +63,7 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
// [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) ) // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
if(m_flip) if(m_flip)
{ {
Packet pi = padd(pset1<Packet>(Scalar(i-m_size1)),m_interPacket); Packet pi = plset<Packet>(Scalar(i-m_size1));
Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi)); Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
if(i==0) if(i==0)
res = pinsertfirst(res, m_low); res = pinsertfirst(res, m_low);
@ -71,7 +71,7 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
} }
else else
{ {
Packet pi = padd(pset1<Packet>(Scalar(i)),m_interPacket); Packet pi = plset<Packet>(Scalar(i));
Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi)); Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
if(i==m_size1-unpacket_traits<Packet>::size+1) if(i==m_size1-unpacket_traits<Packet>::size+1)
res = pinsertlast(res, m_high); res = pinsertlast(res, m_high);
@ -83,7 +83,6 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
const Scalar m_high; const Scalar m_high;
const Index m_size1; const Index m_size1;
const Scalar m_step; const Scalar m_step;
const Packet m_interPacket;
const bool m_flip; const bool m_flip;
}; };
@ -93,8 +92,8 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/true>
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
m_low(low), m_low(low),
m_multiplier((high-low)/convert_index<Scalar>(num_steps<=1 ? 1 : num_steps-1)), m_multiplier((high-low)/convert_index<Scalar>(num_steps<=1 ? 1 : num_steps-1)),
m_divisor(convert_index<Scalar>(num_steps+high-low)/(high-low+1)), m_divisor(convert_index<Scalar>((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))),
m_use_divisor((high+1)<(low+num_steps)) m_use_divisor(num_steps>1 && (numext::abs(high-low)+1)<num_steps)
{} {}
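// Worked example (illustrative, assuming the divisor path returns
// low + i/m_divisor): LinSpaced over integers with num_steps=8, low=1, high=4.
// Here abs(high-low)+1 == 4 < num_steps, so m_use_divisor is true and
// m_divisor == (8 + 3) / 4 == 2, yielding 1,1,2,2,3,3,4,4 -- each value
// repeated evenly, where the old formula could divide by zero when high-low == -1.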
template<typename IndexType> template<typename IndexType>

View File

@ -83,13 +83,17 @@ struct functor_traits<std::binder1st<T> >
{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; }; { enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
#endif #endif
#if (__cplusplus < 201703L) && (EIGEN_COMP_MSVC < 1910)
// std::unary_negate is deprecated since c++17 and will be removed in c++20
template<typename T> template<typename T>
struct functor_traits<std::unary_negate<T> > struct functor_traits<std::unary_negate<T> >
{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; }; { enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
// std::binary_negate is deprecated since c++17 and will be removed in c++20
template<typename T> template<typename T>
struct functor_traits<std::binary_negate<T> > struct functor_traits<std::binary_negate<T> >
{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; }; { enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
#endif
#ifdef EIGEN_STDEXT_SUPPORT #ifdef EIGEN_STDEXT_SUPPORT

View File

@ -768,7 +768,7 @@ struct scalar_sign_op<Scalar,true> {
if (aa==real_type(0)) if (aa==real_type(0))
return Scalar(0); return Scalar(0);
aa = real_type(1)/aa; aa = real_type(1)/aa;
return Scalar(real(a)*aa, imag(a)*aa ); return Scalar(a.real()*aa, a.imag()*aa );
} }
//TODO //TODO
//template <typename Packet> //template <typename Packet>

View File

@ -115,7 +115,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
// registers. However once the latency is hidden there is no point in // registers. However once the latency is hidden there is no point in
// increasing the value of k, so we'll cap it at 320 (value determined // increasing the value of k, so we'll cap it at 320 (value determined
// experimentally). // experimentally).
const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320); // To avoid that k vanishes, we make k_cache at least as big as kr
const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
if (k_cache < k) { if (k_cache < k) {
k = k_cache - (k_cache % kr); k = k_cache - (k_cache % kr);
eigen_internal_assert(k > 0); eigen_internal_assert(k > 0);
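// Numeric sketch (illustrative): with kr == 8 and a tiny L1 making
// (l1-ksub)/kdiv == 2, the old code set k_cache = 2 and then
// k = 2 - (2 % 8) == 0, tripping the assertion just above; clamping k_cache
// to kr instead gives k == 8.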
@ -648,8 +649,8 @@ public:
// Vectorized path // Vectorized path
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
{ {
dest.first = pset1<RealPacket>(real(*b)); dest.first = pset1<RealPacket>(numext::real(*b));
dest.second = pset1<RealPacket>(imag(*b)); dest.second = pset1<RealPacket>(numext::imag(*b));
} }
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
@ -1197,10 +1198,16 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4"); EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
RhsPacket B_0, B1, B2, B3, T0; RhsPacket B_0, B1, B2, B3, T0;
#define EIGEN_GEBGP_ONESTEP(K) \ // NOTE: the begin/end asm comments below work around bug 935!
// but they are not enough for gcc>=6 without FMA (bug 1637)
#if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
#else
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
#endif
#define EIGEN_GEBGP_ONESTEP(K) \
do { \ do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
@ -1212,6 +1219,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
traits.madd(A1, B2, C6, B2); \ traits.madd(A1, B2, C6, B2); \
traits.madd(A0, B3, C3, T0); \ traits.madd(A0, B3, C3, T0); \
traits.madd(A1, B3, C7, B3); \ traits.madd(A1, B3, C7, B3); \
EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
} while(false) } while(false)
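// How the workaround works (assumed intent, restated): the empty asm with
// "+x,m" constraints tells GCC that A0/A1 may live in an SSE register or in
// memory and are clobbered there, which blocks the spilling/rematerialization
// pattern of bug 1637 without emitting any actual instruction.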
@ -1526,10 +1534,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
// The following piece of code won't work for 512 bit registers // The following piece of code won't work for 512 bit registers
// Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
// as nr (which is currently 4) for the return type. // as nr (which is currently 4) for the return type.
typedef typename unpacket_traits<SResPacket>::half SResPacketHalf; const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
if ((SwappedTraits::LhsProgress % 4) == 0 && if ((SwappedTraits::LhsProgress % 4) == 0 &&
(SwappedTraits::LhsProgress <= 8) && (SwappedTraits::LhsProgress <= 8) &&
(SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr)) (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
{ {
SAccPacket C0, C1, C2, C3; SAccPacket C0, C1, C2, C3;
straits.initAcc(C0); straits.initAcc(C0);

View File

@ -20,8 +20,9 @@ template<typename _LhsScalar, typename _RhsScalar> class level3_blocking;
template< template<
typename Index, typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor> int ResInnerStride>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride>
{ {
typedef gebp_traits<RhsScalar,LhsScalar> Traits; typedef gebp_traits<RhsScalar,LhsScalar> Traits;
@ -30,7 +31,7 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
Index rows, Index cols, Index depth, Index rows, Index cols, Index depth,
const LhsScalar* lhs, Index lhsStride, const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsStride, const RhsScalar* rhs, Index rhsStride,
ResScalar* res, Index resStride, ResScalar* res, Index resIncr, Index resStride,
ResScalar alpha, ResScalar alpha,
level3_blocking<RhsScalar,LhsScalar>& blocking, level3_blocking<RhsScalar,LhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0) GemmParallelInfo<Index>* info = 0)
@ -39,8 +40,8 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
general_matrix_matrix_product<Index, general_matrix_matrix_product<Index,
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs, RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs, LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
ColMajor> ColMajor,ResInnerStride>
::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info); ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking,info);
} }
}; };
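The row-major specialization above is pure forwarding: writing a row-major result equals computing the transposed product into a column-major result, $C = AB \iff C^\top = B^\top A^\top$, so run() re-invokes the ColMajor kernel with rows/cols and lhs/rhs exchanged; the destination's new resIncr/ResInnerStride is simply carried along unchanged.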
@ -49,8 +50,9 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
template< template<
typename Index, typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor> int ResInnerStride>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride>
{ {
typedef gebp_traits<LhsScalar,RhsScalar> Traits; typedef gebp_traits<LhsScalar,RhsScalar> Traits;
@ -59,17 +61,17 @@ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScala
static void run(Index rows, Index cols, Index depth, static void run(Index rows, Index cols, Index depth,
const LhsScalar* _lhs, Index lhsStride, const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride, const RhsScalar* _rhs, Index rhsStride,
ResScalar* _res, Index resStride, ResScalar* _res, Index resIncr, Index resStride,
ResScalar alpha, ResScalar alpha,
level3_blocking<LhsScalar,RhsScalar>& blocking, level3_blocking<LhsScalar,RhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0) GemmParallelInfo<Index>* info = 0)
{ {
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper; typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper; typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor,Unaligned,ResInnerStride> ResMapper;
LhsMapper lhs(_lhs,lhsStride); LhsMapper lhs(_lhs, lhsStride);
RhsMapper rhs(_rhs,rhsStride); RhsMapper rhs(_rhs, rhsStride);
ResMapper res(_res, resStride); ResMapper res(_res, resStride, resIncr);
Index kc = blocking.kc(); // cache block size along the K direction Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
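The extra resIncr argument threaded through above feeds the mapper's new inner stride, so the gebp kernel can write to destinations whose coefficients are not contiguous within a column. A hypothetical stripped-down view, not Eigen's blas_data_mapper, showing the addressing the column-major ResMapper is assumed to perform:

#include <cstddef>

// Element (i,j) of the result is assumed to live at
//   _res[i*resIncr + j*resStride];
// resIncr == 1 recovers the old dense-column behaviour.
template <typename Scalar>
struct strided_result_view {
  Scalar* data;
  std::ptrdiff_t incr;    // inner stride, distance between rows (resIncr)
  std::ptrdiff_t stride;  // outer stride, distance between columns (resStride)
  Scalar& operator()(std::ptrdiff_t i, std::ptrdiff_t j) const {
    return data[i * incr + j * stride];
  }
};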
@ -83,8 +85,8 @@ static void run(Index rows, Index cols, Index depth,
if(info) if(info)
{ {
// this is the parallel version!
Index tid = omp_get_thread_num(); int tid = omp_get_thread_num();
Index threads = omp_get_num_threads(); int threads = omp_get_num_threads();
LhsScalar* blockA = blocking.blockA(); LhsScalar* blockA = blocking.blockA();
eigen_internal_assert(blockA!=0); eigen_internal_assert(blockA!=0);
@ -116,9 +118,9 @@ static void run(Index rows, Index cols, Index depth,
info[tid].sync = k; info[tid].sync = k;
// Computes C_i += A' * B' per A'_i
for(Index shift=0; shift<threads; ++shift) for(int shift=0; shift<threads; ++shift)
{ {
Index i = (tid+shift)%threads; int i = (tid+shift)%threads;
// At this point we have to make sure that A'_i has been updated by the thread i,
// we use testAndSetOrdered to mimic a volatile access.
@ -226,7 +228,7 @@ struct gemm_functor
Gemm::run(rows, cols, m_lhs.cols(), Gemm::run(rows, cols, m_lhs.cols(),
&m_lhs.coeffRef(row,0), m_lhs.outerStride(), &m_lhs.coeffRef(row,0), m_lhs.outerStride(),
&m_rhs.coeffRef(0,col), m_rhs.outerStride(), &m_rhs.coeffRef(0,col), m_rhs.outerStride(),
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(), (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.innerStride(), m_dest.outerStride(),
m_actualAlpha, m_blocking, info); m_actualAlpha, m_blocking, info);
} }
@ -428,7 +430,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
lazyproduct::evalTo(dst, lhs, rhs); lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
else else
{ {
dst.setZero(); dst.setZero();
@ -440,7 +442,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
lazyproduct::addTo(dst, lhs, rhs); lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
else else
scaleAndAddTo(dst,lhs, rhs, Scalar(1)); scaleAndAddTo(dst,lhs, rhs, Scalar(1));
} }
@ -449,7 +451,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
lazyproduct::subTo(dst, lhs, rhs); lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
else else
scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
} }
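All three branches above share one cutoff: when rhs.rows() + dst.rows() + dst.cols() < 20, evaluation is routed to the coefficient-based lazy product, now funnelled through eval_dynamic with the assign functor matching each operation, on the assumption that packing and cache blocking cost more than they save for tiny operands. As a worked instance (shapes assumed): a 4x4 = 4x4 * 4x4 product scores 4+4+4 = 12 < 20 and stays lazy, while 16x16 operands score 48 and take the blocked gebp path.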
@ -476,7 +478,8 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
Index, Index,
LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,
Dest::InnerStrideAtCompileTime>,
ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
View File
@ -25,51 +25,54 @@ namespace internal {
**********************************************************************/
// forward declarations (defined at the end of this file)
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo> template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
struct tribb_kernel; struct tribb_kernel;
/* Optimized matrix-matrix product evaluating only one triangular half */
template <typename Index, template <typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResStorageOrder, int UpLo, int Version = Specialized> int ResStorageOrder, int ResInnerStride, int UpLo, int Version = Specialized>
struct general_matrix_matrix_triangular_product; struct general_matrix_matrix_triangular_product;
// as usual if the result is row major => we transpose the product
template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version> typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version> int ResInnerStride, int UpLo, int Version>
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,UpLo,Version>
{ {
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr, Index resStride,
const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking) const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
{ {
general_matrix_matrix_triangular_product<Index, general_matrix_matrix_triangular_product<Index,
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs, RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs, LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
ColMajor, UpLo==Lower?Upper:Lower> ColMajor, ResInnerStride, UpLo==Lower?Upper:Lower>
::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking); ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking);
} }
}; };
template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version> typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version> int ResInnerStride, int UpLo, int Version>
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,UpLo,Version>
{ {
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const RhsScalar* _rhs, Index rhsStride,
ResScalar* _res, Index resIncr, Index resStride,
const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking) const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
{ {
typedef gebp_traits<LhsScalar,RhsScalar> Traits; typedef gebp_traits<LhsScalar,RhsScalar> Traits;
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper; typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper; typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
LhsMapper lhs(_lhs,lhsStride); LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride); RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride); ResMapper res(_res, resStride, resIncr);
Index kc = blocking.kc(); Index kc = blocking.kc();
Index mc = (std::min)(size,blocking.mc()); Index mc = (std::min)(size,blocking.mc());
@ -87,7 +90,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs; gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb; tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, ResInnerStride, UpLo> sybb;
for(Index k2=0; k2<depth; k2+=kc) for(Index k2=0; k2<depth; k2+=kc)
{ {
@ -110,8 +113,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
(std::min)(size,i2), alpha, -1, -1, 0, 0); (std::min)(size,i2), alpha, -1, -1, 0, 0);
sybb(_res+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
if (UpLo==Upper) if (UpLo==Upper)
{ {
@ -133,7 +135,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
// while the triangular block overlapping the diagonal is evaluated into a
// small temporary buffer which is then accumulated into the result using a
// triangular traversal.
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo> template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
struct tribb_kernel struct tribb_kernel
{ {
typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits; typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
@ -142,13 +144,15 @@ struct tribb_kernel
enum { enum {
BlockSize = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret BlockSize = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret
}; };
void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
{ {
typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper; typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
ResMapper res(_res, resStride); typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned> BufferMapper;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel; ResMapper res(_res, resStride, resIncr);
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel1;
gebp_kernel<LhsScalar, RhsScalar, Index, BufferMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel2;
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer; Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert()));
// let's process the block per panel of actual_mc x BlockSize,
// again, each is split into three parts, etc.
@ -158,31 +162,32 @@ struct tribb_kernel
const RhsScalar* actual_b = blockB+j*depth; const RhsScalar* actual_b = blockB+j*depth;
if(UpLo==Upper) if(UpLo==Upper)
gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, gebp_kernel1(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
-1, -1, 0, 0); -1, -1, 0, 0);
// selfadjoint micro block
{ {
Index i = j; Index i = j;
buffer.setZero(); buffer.setZero();
// 1 - apply the kernel on the temporary buffer
gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, gebp_kernel2(BufferMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0); -1, -1, 0, 0);
// 2 - triangular accumulation
for(Index j1=0; j1<actualBlockSize; ++j1) for(Index j1=0; j1<actualBlockSize; ++j1)
{ {
ResScalar* r = &res(i, j + j1); typename ResMapper::LinearMapper r = res.getLinearMapper(i,j+j1);
for(Index i1=UpLo==Lower ? j1 : 0; for(Index i1=UpLo==Lower ? j1 : 0;
UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1) UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
r[i1] += buffer(i1,j1); r(i1) += buffer(i1,j1);
} }
} }
if(UpLo==Lower) if(UpLo==Lower)
{ {
Index i = j+actualBlockSize; Index i = j+actualBlockSize;
gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i, gebp_kernel1(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
depth, actualBlockSize, alpha, -1, -1, 0, 0); depth, actualBlockSize, alpha, -1, -1, 0, 0);
} }
} }
} }
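A minimal sketch (plain Eigen types assumed, not the mapper machinery) of what the selfadjoint micro block above does: the diagonal-crossing block is computed densely into buffer, then only the requested triangle is accumulated back, so no coefficient of the destination triangle is touched twice:

#include <Eigen/Dense>

template <bool IsLower>
void accumulate_triangle(Eigen::MatrixXd& res, const Eigen::MatrixXd& buffer,
                         Eigen::Index i, Eigen::Index j)
{
  const Eigen::Index bs = buffer.rows();   // plays the role of actualBlockSize
  for (Eigen::Index j1 = 0; j1 < bs; ++j1)
    for (Eigen::Index i1 = IsLower ? j1 : 0;
         IsLower ? i1 < bs : i1 <= j1; ++i1)
      res(i + i1, j + j1) += buffer(i1, j1); // mirrors the r(i1) += buffer(i1,j1) loop
}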
@ -269,10 +274,13 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
enum { enum {
IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0, IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0, LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0,
RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0 RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0,
SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0
}; };
Index size = mat.cols(); Index size = mat.cols();
if(SkipDiag)
size--;
Index depth = actualLhs.cols(); Index depth = actualLhs.cols();
typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar, typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar,
@ -283,10 +291,12 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
internal::general_matrix_matrix_triangular_product<Index, internal::general_matrix_matrix_triangular_product<Index,
typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
IsRowMajor ? RowMajor : ColMajor, UpLo> IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo&(Lower|Upper)>
::run(size, depth, ::run(size, depth,
&actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(), &actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(),
mat.data(), mat.outerStride(), actualAlpha, blocking); &actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(),
mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? mat.innerStride() : mat.outerStride() ) : 0),
mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
} }
}; };
@ -294,6 +304,7 @@ template<typename MatrixType, unsigned int UpLo>
template<typename ProductType> template<typename ProductType>
TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
{ {
EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols()); eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta); general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
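At the API level this selector backs triangular-product assignments; with the SkipDiag handling above, strictly triangular destinations now shift the operands and the output pointer past the diagonal instead of producing wrong results. A usage sketch (shapes assumed square and conforming):

#include <Eigen/Dense>
using Eigen::MatrixXd;

void product_into_triangle(MatrixXd& c, const MatrixXd& a, const MatrixXd& b)
{
  c.triangularView<Eigen::Lower>()         = a * b;  // computes/writes the lower half only
  c.triangularView<Eigen::StrictlyLower>() += a * b; // diagonal skipped via SkipDiag
  // c.triangularView<Eigen::UnitLower>() = a * b;   // rejected by the static assert above
}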
View File
@ -40,7 +40,7 @@ namespace internal {
template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo> template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
struct general_matrix_matrix_rankupdate : struct general_matrix_matrix_rankupdate :
general_matrix_matrix_triangular_product< general_matrix_matrix_triangular_product<
Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {}; Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {};
// try to go to BLAS specialization
@ -48,19 +48,19 @@ struct general_matrix_matrix_rankupdate :
template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \ template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs, int UpLo> \ int RhsStorageOrder, bool ConjugateRhs, int UpLo> \
struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \ struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \ Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,1,UpLo,Specialized> { \
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \ const Scalar* rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
{ \ { \
if (lhs==rhs) { \ if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \
general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \ general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
} else { \ } else { \
general_matrix_matrix_triangular_product<Index, \ general_matrix_matrix_triangular_product<Index, \
Scalar, LhsStorageOrder, ConjugateLhs, \ Scalar, LhsStorageOrder, ConjugateLhs, \
Scalar, RhsStorageOrder, ConjugateRhs, \ Scalar, RhsStorageOrder, ConjugateRhs, \
ColMajor, UpLo, BuiltIn> \ ColMajor, 1, UpLo, BuiltIn> \
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resIncr,resStride,alpha,blocking); \
} \ } \
} \ } \
}; };
@ -88,7 +88,7 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \ BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \ char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
EIGTYPE beta(1); \ EIGTYPE beta(1); \
BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \ BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \
} \ } \
}; };
@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
} \ } \
}; };
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk)
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk)
#else
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_) EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_) EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_)
#endif
// TODO handle complex cases
// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
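When lhs==rhs (and the triangle flags are plain Lower/Upper), the macro above hands the self-product to BLAS ?syrk, which updates only one triangle of C with alpha*A*A^T + beta*C. A hedged sketch of the equivalent direct call, assuming the usual Fortran dsyrk_ binding:

extern "C" void dsyrk_(const char* uplo, const char* trans,
                       const int* n, const int* k,
                       const double* alpha, const double* a, const int* lda,
                       const double* beta, double* c, const int* ldc);

// c (n x n, column-major, lower triangle) += alpha * a * a^T, with a of size n x k.
void rank_k_update_lower(int n, int k, double alpha,
                         const double* a, int lda, double* c, int ldc)
{
  const char uplo  = 'L';
  const char trans = 'N';      // 'T' would correspond to a row-major A
  const double beta = 1.0;     // matches the EIGTYPE beta(1) above
  dsyrk_(&uplo, &trans, &n, &k, &alpha, a, &lda, &beta, c, &ldc);
}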
View File
@ -46,25 +46,27 @@ namespace internal {
// gemm specialization
#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \ #define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
template< \ template< \
typename Index, \ typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \ int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \ int RhsStorageOrder, bool ConjugateRhs> \
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \ struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1> \
{ \ { \
typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \ typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
\ \
static void run(Index rows, Index cols, Index depth, \ static void run(Index rows, Index cols, Index depth, \
const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \ EIGTYPE* res, Index resIncr, Index resStride, \
EIGTYPE alpha, \ EIGTYPE alpha, \
level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \ level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
GemmParallelInfo<Index>* /*info = 0*/) \ GemmParallelInfo<Index>* /*info = 0*/) \
{ \ { \
using std::conj; \ using std::conj; \
\ \
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
char transa, transb; \ char transa, transb; \
BlasIndex m, n, k, lda, ldb, ldc; \ BlasIndex m, n, k, lda, ldb, ldc; \
const EIGTYPE *a, *b; \ const EIGTYPE *a, *b; \
@ -100,13 +102,20 @@ static void run(Index rows, Index cols, Index depth, \
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _rhs; \ } else b = _rhs; \
\ \
BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
}}; }};
GEMM_SPECIALIZATION(double, d, double, d) #ifdef EIGEN_USE_MKL
GEMM_SPECIALIZATION(float, f, float, s) GEMM_SPECIALIZATION(double, d, double, dgemm)
GEMM_SPECIALIZATION(dcomplex, cd, double, z) GEMM_SPECIALIZATION(float, f, float, sgemm)
GEMM_SPECIALIZATION(scomplex, cf, float, c) GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
#else
GEMM_SPECIALIZATION(double, d, double, dgemm_)
GEMM_SPECIALIZATION(float, f, float, sgemm_)
GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
#endif
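The BLASPREFIX to BLASFUNC rename here (and in the specializations below) exists because the two ABIs spell their symbols differently: MKL's C interface exports dgemm with no trailing underscore and takes MKL_Complex8/MKL_Complex16, while a Fortran BLAS exports dgemm_ with complex arguments passed through real pointers. A sketch of the split, with MY_GEMM as a hypothetical illustration name:

#ifdef EIGEN_USE_MKL
  #include <mkl.h>          // declares dgemm and the MKL_Complex types
  #define MY_GEMM dgemm     // C symbol, no trailing underscore
#else
  extern "C" void dgemm_(const char*, const char*, const int*, const int*,
                         const int*, const double*, const double*, const int*,
                         const double*, const int*, const double*, double*,
                         const int*);   // Fortran ABI, trailing underscore
  #define MY_GEMM dgemm_
#endif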
} // end namespace internal
View File
@ -183,8 +183,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
alignmentPattern = AllAligned; alignmentPattern = AllAligned;
} }
const Index offset1 = (FirstAligned && alignmentStep==1)?3:1; const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
const Index offset3 = (FirstAligned && alignmentStep==1)?1:3; const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce) for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
@ -457,8 +457,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,R
alignmentPattern = AllAligned; alignmentPattern = AllAligned;
} }
const Index offset1 = (FirstAligned && alignmentStep==1)?3:1; const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
const Index offset3 = (FirstAligned && alignmentStep==1)?1:3; const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce) for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
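The offset1/offset3 change in both hunks above fixes a classic enum-in-boolean-context bug: FirstAligned is a nonzero enumerator, so the old condition (FirstAligned && alignmentStep==1) degenerated to alignmentStep==1 whatever pattern was actually detected. A minimal reproduction, with assumed enumerator values (only "nonzero" matters):

enum AlignmentPattern { NoneAligned = 0, EvenAligned, FirstAligned, AllAligned };

int offset1_old(AlignmentPattern /*pattern*/, int alignmentStep)
{
  return (FirstAligned && alignmentStep == 1) ? 3 : 1;  // FirstAligned != 0: pattern ignored
}

int offset1_new(AlignmentPattern pattern, int alignmentStep)
{
  return (pattern == FirstAligned && alignmentStep == 1) ? 3 : 1;  // intended test
}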
View File
@ -85,7 +85,7 @@ EIGEN_BLAS_GEMV_SPECIALIZE(float)
EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex) EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
EIGEN_BLAS_GEMV_SPECIALIZE(scomplex) EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \ #define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \ template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \ struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
{ \ { \
@ -113,14 +113,21 @@ static void run( \
x_ptr=x_tmp.data(); \ x_ptr=x_tmp.data(); \
incx=1; \ incx=1; \
} else x_ptr=rhs; \ } else x_ptr=rhs; \
BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
}\ }\
}; };
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d) #ifdef EIGEN_USE_MKL
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s) EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv)
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z) EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv)
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c) EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv)
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv)
#else
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_)
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_)
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_)
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_)
#endif
} // end namespace internal
View File
@ -17,7 +17,8 @@ namespace internal {
/** \internal */
inline void manage_multi_threading(Action action, int* v) inline void manage_multi_threading(Action action, int* v)
{ {
static EIGEN_UNUSED int m_maxThreads = -1; static int m_maxThreads = -1;
EIGEN_UNUSED_VARIABLE(m_maxThreads);
if(action==SetAction) if(action==SetAction)
{ {
@ -75,7 +76,7 @@ template<typename Index> struct GemmParallelInfo
{ {
GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
int volatile sync; Index volatile sync;
int volatile users; int volatile users;
Index lhs_start; Index lhs_start;
@ -104,13 +105,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
// - the sizes are large enough
// compute the maximal number of threads from the size of the product:
// FIXME this has to be fine tuned // This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once.
Index size = transpose ? rows : cols; Index size = transpose ? rows : cols;
Index pb_max_threads = std::max<Index>(1,size / 32); Index pb_max_threads = std::max<Index>(1,size / Functor::Traits::nr);
// compute the maximal number of threads from the total amount of work:
double work = static_cast<double>(rows) * static_cast<double>(cols) * double work = static_cast<double>(rows) * static_cast<double>(cols) *
static_cast<double>(depth); static_cast<double>(depth);
double kMinTaskSize = 50000; // Heuristic. double kMinTaskSize = 50000; // FIXME improve this heuristic.
pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize)); pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize));
// compute the number of threads we are going to use
@ -149,8 +151,10 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
info[i].lhs_start = r0; info[i].lhs_start = r0;
info[i].lhs_length = actualBlockRows; info[i].lhs_length = actualBlockRows;
if(transpose) func(c0, actualBlockCols, 0, rows, info); if(transpose)
else func(0, rows, c0, actualBlockCols, info); func(c0, actualBlockCols, 0, rows, info);
else
func(0, rows, c0, actualBlockCols, info);
} }
#endif #endif
} }
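Pulling the two caps above together: a thread must own at least Traits::nr columns (so the kernel always runs full register panels), and each task must amount to at least kMinTaskSize scalar multiply-adds. A self-contained rendering of that arithmetic, with example numbers in the trailing comment:

#include <algorithm>

// Sketch of the capping logic above; nr is the kernel's register-panel width.
long max_gemm_threads(long rows, long cols, long depth, long nr, bool transpose)
{
  long size = transpose ? rows : cols;
  long pb_max_threads = std::max(1L, size / nr);   // >= nr columns per thread
  double work = double(rows) * double(cols) * double(depth);
  double kMinTaskSize = 50000;                     // heuristic from above
  pb_max_threads = std::max(1L, std::min<long>(pb_max_threads,
                                               long(work / kMinTaskSize)));
  return pb_max_threads;
}
// e.g. with nr == 4: max_gemm_threads(256, 256, 256, 4, false) == 64,
// but max_gemm_threads(64, 64, 8, 4, false) == 1 (32768 flops < 50000).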
View File
@ -277,20 +277,21 @@ struct symm_pack_rhs
template <typename Scalar, typename Index, template <typename Scalar, typename Index,
int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs, int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs, int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
int ResStorageOrder> int ResStorageOrder, int ResInnerStride>
struct product_selfadjoint_matrix; struct product_selfadjoint_matrix;
template <typename Scalar, typename Index, template <typename Scalar, typename Index,
int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs, int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs> int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor> int ResInnerStride>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor,ResInnerStride>
{ {
static EIGEN_STRONG_INLINE void run( static EIGEN_STRONG_INLINE void run(
Index rows, Index cols, Index rows, Index cols,
const Scalar* lhs, Index lhsStride, const Scalar* lhs, Index lhsStride,
const Scalar* rhs, Index rhsStride, const Scalar* rhs, Index rhsStride,
Scalar* res, Index resStride, Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{ {
product_selfadjoint_matrix<Scalar, Index, product_selfadjoint_matrix<Scalar, Index,
@ -298,33 +299,35 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs), RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs),
EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor, EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
ColMajor> ColMajor,ResInnerStride>
::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking);
} }
}; };
template <typename Scalar, typename Index, template <typename Scalar, typename Index,
int LhsStorageOrder, bool ConjugateLhs, int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs> int RhsStorageOrder, bool ConjugateRhs,
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor> int ResInnerStride>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>
{ {
static EIGEN_DONT_INLINE void run( static EIGEN_DONT_INLINE void run(
Index rows, Index cols, Index rows, Index cols,
const Scalar* _lhs, Index lhsStride, const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride, const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride, Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking); const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
}; };
template <typename Scalar, typename Index, template <typename Scalar, typename Index,
int LhsStorageOrder, bool ConjugateLhs, int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs> int RhsStorageOrder, bool ConjugateRhs,
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>::run( int ResInnerStride>
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>::run(
Index rows, Index cols, Index rows, Index cols,
const Scalar* _lhs, Index lhsStride, const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride, const Scalar* _rhs, Index rhsStride,
Scalar* _res, Index resStride, Scalar* _res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{ {
Index size = rows; Index size = rows;
@ -334,11 +337,11 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper; typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
LhsMapper lhs(_lhs,lhsStride); LhsMapper lhs(_lhs,lhsStride);
LhsTransposeMapper lhs_transpose(_lhs,lhsStride); LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride); RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride); ResMapper res(_res, resStride, resIncr);
Index kc = blocking.kc(); // cache block size along the K direction Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@ -398,26 +401,28 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// matrix * selfadjoint product
template <typename Scalar, typename Index, template <typename Scalar, typename Index,
int LhsStorageOrder, bool ConjugateLhs, int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs> int RhsStorageOrder, bool ConjugateRhs,
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor> int ResInnerStride>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>
{ {
static EIGEN_DONT_INLINE void run( static EIGEN_DONT_INLINE void run(
Index rows, Index cols, Index rows, Index cols,
const Scalar* _lhs, Index lhsStride, const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride, const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride, Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking); const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
}; };
template <typename Scalar, typename Index, template <typename Scalar, typename Index,
int LhsStorageOrder, bool ConjugateLhs, int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs> int RhsStorageOrder, bool ConjugateRhs,
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>::run( int ResInnerStride>
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>::run(
Index rows, Index cols, Index rows, Index cols,
const Scalar* _lhs, Index lhsStride, const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride, const Scalar* _rhs, Index rhsStride,
Scalar* _res, Index resStride, Scalar* _res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{ {
Index size = cols; Index size = cols;
@ -425,9 +430,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
typedef gebp_traits<Scalar,Scalar> Traits; typedef gebp_traits<Scalar,Scalar> Traits;
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
LhsMapper lhs(_lhs,lhsStride); LhsMapper lhs(_lhs,lhsStride);
ResMapper res(_res,resStride); ResMapper res(_res,resStride, resIncr);
Index kc = blocking.kc(); // cache block size along the K direction Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@ -503,12 +508,13 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor> internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor,
Dest::InnerStrideAtCompileTime>
::run( ::run(
lhs.rows(), rhs.cols(), // sizes lhs.rows(), rhs.cols(), // sizes
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
&dst.coeffRef(0,0), dst.outerStride(), // result info &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info
actualAlpha, blocking // alpha actualAlpha, blocking // alpha
); );
} }
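For reference, the user-level expression that reaches this selfadjoint kernel, now with the destination's innerStride() passed through so non-contiguous destinations work; only the stored triangle of the selfadjoint operand is read:

#include <Eigen/Dense>
using Eigen::MatrixXd;

MatrixXd symm_times(const MatrixXd& a, const MatrixXd& b)
{
  // Reads only the lower triangle of 'a'; dispatches to
  // product_selfadjoint_matrix (or the ?SYMM BLAS specialization below).
  return a.selfadjointView<Eigen::Lower>() * b;
}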
View File
@ -40,20 +40,22 @@ namespace internal {
/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ #define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \ template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \ int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \ int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
{\ {\
\ \
static void run( \ static void run( \
Index rows, Index cols, \ Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \ EIGTYPE* res, Index resIncr, Index resStride, \
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \ { \
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
char side='L', uplo='L'; \ char side='L', uplo='L'; \
BlasIndex m, n, lda, ldb, ldc; \ BlasIndex m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \ const EIGTYPE *a, *b; \
@ -81,25 +83,27 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _rhs; \ } else b = _rhs; \
\ \
BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\ \
} \ } \
}; };
#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ #define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \ template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \ int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \ int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
{\ {\
static void run( \ static void run( \
Index rows, Index cols, \ Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \ EIGTYPE* res, Index resIncr, Index resStride, \
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \ { \
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
char side='L', uplo='L'; \ char side='L', uplo='L'; \
BlasIndex m, n, lda, ldb, ldc; \ BlasIndex m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \ const EIGTYPE *a, *b; \
@ -144,33 +148,41 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} \ } \
\ \
BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\ \
} \ } \
}; };
EIGEN_BLAS_SYMM_L(double, double, d, d) #ifdef EIGEN_USE_MKL
EIGEN_BLAS_SYMM_L(float, float, f, s) EIGEN_BLAS_SYMM_L(double, double, d, dsymm)
EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z) EIGEN_BLAS_SYMM_L(float, float, f, ssymm)
EIGEN_BLAS_HEMM_L(scomplex, float, cf, c) EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm)
EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm)
#else
EIGEN_BLAS_SYMM_L(double, double, d, dsymm_)
EIGEN_BLAS_SYMM_L(float, float, f, ssymm_)
EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_)
EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_)
#endif
/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ #define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \ template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \ int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \ int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \
{\ {\
\ \
static void run( \ static void run( \
Index rows, Index cols, \ Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \ EIGTYPE* res, Index resIncr, Index resStride, \
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \ { \
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
char side='R', uplo='L'; \ char side='R', uplo='L'; \
BlasIndex m, n, lda, ldb, ldc; \ BlasIndex m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \ const EIGTYPE *a, *b; \
@ -197,25 +209,27 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _lhs; \ } else b = _lhs; \
\ \
BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\ \
} \ } \
}; };
#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ #define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \ template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \ int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \ int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \
{\ {\
static void run( \ static void run( \
Index rows, Index cols, \ Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \ EIGTYPE* res, Index resIncr, Index resStride, \
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \ { \
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
char side='R', uplo='L'; \ char side='R', uplo='L'; \
BlasIndex m, n, lda, ldb, ldc; \ BlasIndex m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \ const EIGTYPE *a, *b; \
@ -259,15 +273,21 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} \ } \
\ \
BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
} \ } \
}; };
EIGEN_BLAS_SYMM_R(double, double, d, d) #ifdef EIGEN_USE_MKL
EIGEN_BLAS_SYMM_R(float, float, f, s) EIGEN_BLAS_SYMM_R(double, double, d, dsymm)
EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z) EIGEN_BLAS_SYMM_R(float, float, f, ssymm)
EIGEN_BLAS_HEMM_R(scomplex, float, cf, c) EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm)
EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm)
#else
EIGEN_BLAS_SYMM_R(double, double, d, dsymm_)
EIGEN_BLAS_SYMM_R(float, float, f, ssymm_)
EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_)
EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_)
#endif
} // end namespace internal
} // end namespace Eigen
View File
@@ -83,10 +83,10 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
       Scalar t3(0);
       Packet ptmp3 = pset1<Packet>(t3);

-      size_t starti = FirstTriangular ? 0 : j+2;
-      size_t endi   = FirstTriangular ? j : size;
-      size_t alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
-      size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
+      Index starti = FirstTriangular ? 0 : j+2;
+      Index endi   = FirstTriangular ? j : size;
+      Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
+      Index alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);

       res[j]   += cjd.pmul(numext::real(A0[j]), t0);
       res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1);
@@ -101,7 +101,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
         t2 += cj1.pmul(A0[j+1], rhs[j+1]);
       }

-      for (size_t i=starti; i<alignedStart; ++i)
+      for (Index i=starti; i<alignedStart; ++i)
       {
         res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
         t2 += cj1.pmul(A0[i], rhs[i]);
@@ -113,7 +113,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
         const Scalar* EIGEN_RESTRICT a1It  = A1  + alignedStart;
         const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
         Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
-        for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
+        for (Index i=alignedStart; i<alignedEnd; i+=PacketSize)
         {
           Packet A0i = ploadu<Packet>(a0It);  a0It  += PacketSize;
           Packet A1i = ploadu<Packet>(a1It);  a1It  += PacketSize;
@@ -125,7 +125,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
           ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
           pstore(resIt,Xi); resIt += PacketSize;
         }
-        for (size_t i=alignedEnd; i<endi; i++)
+        for (Index i=alignedEnd; i<endi; i++)
         {
           res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
           t2 += cj1.pmul(A0[i], rhs[i]);
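One plausible motivation for the size_t to Index switch above: the loop bounds and extents are computed by subtraction, and unsigned arithmetic silently wraps instead of going negative. A minimal illustration of that failure mode, assuming nothing beyond standard C++ (Eigen::Index defaults to std::ptrdiff_t):

#include <cstddef>
#include <cstdio>

int main() {
  // With unsigned size_t, a negative extent wraps around, so a trip
  // count like (endi - starti) / PacketSize can become astronomically
  // large instead of zero.
  std::size_t starti = 8, endi = 4;
  std::printf("unsigned extent: %zu\n", endi - starti);  // huge wrapped value
  // With a signed type the same arithmetic stays meaningful.
  std::ptrdiff_t s = 8, e = 4;
  std::printf("signed extent:   %td\n", e - s);          // -4
  return 0;
}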
@@ -95,14 +95,21 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \
     x_tmp=map_x.conjugate(); \
     x_ptr=x_tmp.data(); \
   } else x_ptr=_rhs; \
-  BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
+  BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
 }\
 };

+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv)
+#else
 EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_)
 EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_)
 EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
 EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_)
+#endif

 } // end namespace internal
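The extra (const BLASTYPE*)& casts around numext::real_ref(alpha) exist because the MKL prototypes take MKL_Complex8/MKL_Complex16 pointers while Eigen holds std::complex scalars; since C++11 guarantees std::complex<T> is layout-compatible with T[2], a pointer cast suffices. A self-contained sketch with a hypothetical stub in place of a real BLAS routine:

#include <complex>
#include <cstdio>

// Hypothetical stand-in for MKL's complex type; real code would use
// the MKL_Complex16 definition from mkl_types.h.
struct Complex16 { double real, imag; };

// Stub with an MKL-like signature: the scalar is passed by pointer.
void zhemv_stub(const Complex16* alpha) {
  std::printf("alpha = (%g, %g)\n", alpha->real, alpha->imag);
}

int main() {
  std::complex<double> alpha(2.0, -1.0);
  // The layout-compatibility guarantee is what makes the macro's
  // (const BLASTYPE*)& cast legal.
  zhemv_stub(reinterpret_cast<const Complex16*>(&alpha));
  return 0;
}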
@@ -109,10 +109,10 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
     internal::general_matrix_matrix_triangular_product<Index,
       Scalar, OtherIsRowMajor ? RowMajor : ColMajor,   OtherBlasTraits::NeedToConjugate  && NumTraits<Scalar>::IsComplex,
       Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
-      IsRowMajor ? RowMajor : ColMajor, UpLo>
+      IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo>
       ::run(size, depth,
             &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(),
-            mat.data(), mat.outerStride(), actualAlpha, blocking);
+            mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
   }
 };
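Forwarding MatrixType::InnerStrideAtCompileTime and mat.innerStride() lets the rank-update kernel write into destinations whose coefficients are not contiguous in memory. A small example of such a destination, built with the public Map/Stride API (illustrative only, not part of the patch):

#include <Eigen/Dense>
#include <iostream>

int main() {
  // A 3x3 view whose inner stride is 2: it touches every other entry
  // of buf, so its coefficients are not contiguous in memory.
  double buf[18] = {};
  Eigen::Map<Eigen::Matrix3d, 0, Eigen::Stride<6, 2> > m(buf);
  m.setIdentity();
  // InnerStrideAtCompileTime is what the product kernels now receive
  // as a template parameter; innerStride() is the runtime value.
  std::cout << "compile-time inner stride: "
            << int(decltype(m)::InnerStrideAtCompileTime) << "\n";
  std::cout << "runtime inner stride:      " << m.innerStride() << "\n";
  return 0;
}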
@@ -45,22 +45,24 @@ template <typename Scalar, typename Index,
           int Mode, bool LhsIsTriangular,
           int LhsStorageOrder, bool ConjugateLhs,
           int RhsStorageOrder, bool ConjugateRhs,
-          int ResStorageOrder, int Version = Specialized>
+          int ResStorageOrder, int ResInnerStride,
+          int Version = Specialized>
 struct product_triangular_matrix_matrix;

 template <typename Scalar, typename Index,
           int Mode, bool LhsIsTriangular,
           int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
 struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
                                         LhsStorageOrder,ConjugateLhs,
-                                        RhsStorageOrder,ConjugateRhs,RowMajor,Version>
+                                        RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,Version>
 {
   static EIGEN_STRONG_INLINE void run(
     Index rows, Index cols, Index depth,
     const Scalar* lhs, Index lhsStride,
     const Scalar* rhs, Index rhsStride,
-    Scalar* res, Index resStride,
+    Scalar* res, Index resIncr, Index resStride,
     const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
   {
     product_triangular_matrix_matrix<Scalar, Index,
@@ -70,18 +72,19 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
       ConjugateRhs,
       LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
       ConjugateLhs,
-      ColMajor>
-      ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
+      ColMajor, ResInnerStride>
+      ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking);
   }
 };

 // implements col-major += alpha * op(triangular) * op(general)
 template <typename Scalar, typename Index, int Mode,
           int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
 struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
                                         LhsStorageOrder,ConjugateLhs,
-                                        RhsStorageOrder,ConjugateRhs,ColMajor,Version>
+                                        RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>
 {
   typedef gebp_traits<Scalar,Scalar> Traits;
@@ -95,20 +98,21 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
     Index _rows, Index _cols, Index _depth,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* res, Index resStride,
+    Scalar* res, Index resIncr, Index resStride,
     const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
 };

 template <typename Scalar, typename Index, int Mode,
           int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
 EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
                                                         LhsStorageOrder,ConjugateLhs,
-                                                        RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
+                                                        RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
     Index _rows, Index _cols, Index _depth,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* _res, Index resStride,
+    Scalar* _res, Index resIncr, Index resStride,
     const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
 {
   // strip zeros
@@ -119,10 +123,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
   typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
   typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
-  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
   LhsMapper lhs(_lhs,lhsStride);
   RhsMapper rhs(_rhs,rhsStride);
-  ResMapper res(_res, resStride);
+  ResMapper res(_res, resStride, resIncr);

   Index kc = blocking.kc();                   // cache block size along the K direction
   Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
@@ -137,7 +141,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
   ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
   ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());

-  Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
+  // To work around an "error: member reference base type 'Matrix<...>
+  // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is
+  // not a structure or union" compilation error in nvcc (tested V8.0.61),
+  // create a dummy internal::constructor_without_unaligned_array_assert
+  // object to pass to the Matrix constructor.
+  internal::constructor_without_unaligned_array_assert a;
+  Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer(a);
   triangularBuffer.setZero();
   if((Mode&ZeroDiag)==ZeroDiag)
     triangularBuffer.diagonal().setZero();
@@ -229,10 +239,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
 // implements col-major += alpha * op(general) * op(triangular)
 template <typename Scalar, typename Index, int Mode,
           int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
 struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
                                         LhsStorageOrder,ConjugateLhs,
-                                        RhsStorageOrder,ConjugateRhs,ColMajor,Version>
+                                        RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>
 {
   typedef gebp_traits<Scalar,Scalar> Traits;
   enum {
@@ -245,20 +256,21 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
     Index _rows, Index _cols, Index _depth,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* res, Index resStride,
+    Scalar* res, Index resIncr, Index resStride,
     const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
 };

 template <typename Scalar, typename Index, int Mode,
           int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
 EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
                                                         LhsStorageOrder,ConjugateLhs,
-                                                        RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
+                                                        RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
     Index _rows, Index _cols, Index _depth,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* _res, Index resStride,
+    Scalar* _res, Index resIncr, Index resStride,
     const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
 {
   const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
@@ -270,10 +282,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
   typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
   typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
-  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
   LhsMapper lhs(_lhs,lhsStride);
   RhsMapper rhs(_rhs,rhsStride);
-  ResMapper res(_res, resStride);
+  ResMapper res(_res, resStride, resIncr);

   Index kc = blocking.kc();                   // cache block size along the K direction
   Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
@@ -284,7 +296,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
   ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
   ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());

-  Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
+  internal::constructor_without_unaligned_array_assert a;
+  Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer(a);
   triangularBuffer.setZero();
   if((Mode&ZeroDiag)==ZeroDiag)
     triangularBuffer.diagonal().setZero();
@@ -393,7 +406,9 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
 {
   template<typename Dest> static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha)
   {
-    typedef typename Dest::Scalar Scalar;
+    typedef typename Lhs::Scalar  LhsScalar;
+    typedef typename Rhs::Scalar  RhsScalar;
+    typedef typename Dest::Scalar Scalar;

     typedef internal::blas_traits<Lhs> LhsBlasTraits;
     typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
@@ -405,8 +420,9 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
     typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
     typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);

-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
-                               * RhsBlasTraits::extractScalarFactor(a_rhs);
+    LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs);
+    RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs);
+    Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha;

     typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
             Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
@@ -423,14 +439,29 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
       Mode, LhsIsTriangular,
       (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
       (internal::traits<ActualRhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
-      (internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor>
+      (internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor, Dest::InnerStrideAtCompileTime>
       ::run(
         stripedRows, stripedCols, stripedDepth,   // sizes
         &lhs.coeffRef(0,0), lhs.outerStride(),    // lhs info
         &rhs.coeffRef(0,0), rhs.outerStride(),    // rhs info
-        &dst.coeffRef(0,0), dst.outerStride(),    // result info
+        &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(),    // result info
         actualAlpha, blocking
       );

+    // Apply correction if the diagonal is unit and a scalar factor was nested:
+    if ((Mode&UnitDiag)==UnitDiag)
+    {
+      if (LhsIsTriangular && lhs_alpha!=LhsScalar(1))
+      {
+        Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+        dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize);
+      }
+      else if ((!LhsIsTriangular) && (rhs_alpha!=RhsScalar(1)))
+      {
+        Index diagSize = (std::min)(rhs.rows(),rhs.cols());
+        dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize);
+      }
+    }
   }
 };
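The correction block added at the end addresses unit-diagonal views of scaled expressions: for (s*A).triangularView<UnitLower>(), blas_traits hoists s out of the product, but the implicit unit diagonal must not be scaled, so (s-1) times the affected rows (or columns) of the other operand is subtracted afterwards. A sketch of the identity being used, checked with the public API only:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Matrix3d A = Eigen::Matrix3d::Random();
  Eigen::Matrix3d B = Eigen::Matrix3d::Random();
  const double s = 2.0;

  // What the user wrote: the unit-diagonal view of (s*A). Its diagonal
  // is 1 by definition, so s must not leak onto the implicit identity.
  Eigen::Matrix3d ref = (s * A).triangularView<Eigen::UnitLower>() * B;

  // What the kernel computes once s has been hoisted out, followed by
  // the correction the patch applies: subtract (s-1)*B over the
  // diagonal block (here the whole matrix, since A is square).
  Eigen::Matrix3d kernel =
      s * (A.triangularView<Eigen::UnitLower>().toDenseMatrix() * B);
  Eigen::Matrix3d fixed = kernel - (s - 1.0) * B;

  std::cout << "difference: " << (fixed - ref).norm() << "\n";  // ~0
  return 0;
}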
@ -46,7 +46,7 @@ template <typename Scalar, typename Index,
struct product_triangular_matrix_matrix_trmm : struct product_triangular_matrix_matrix_trmm :
product_triangular_matrix_matrix<Scalar,Index,Mode, product_triangular_matrix_matrix<Scalar,Index,Mode,
LhsIsTriangular,LhsStorageOrder,ConjugateLhs, LhsIsTriangular,LhsStorageOrder,ConjugateLhs,
RhsStorageOrder, ConjugateRhs, ResStorageOrder, BuiltIn> {}; RhsStorageOrder, ConjugateRhs, ResStorageOrder, 1, BuiltIn> {};
// try to go to BLAS specialization // try to go to BLAS specialization
@ -55,13 +55,15 @@ template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \ int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \ int RhsStorageOrder, bool ConjugateRhs> \
struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \ struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,Specialized> { \ LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,1,Specialized> { \
static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\ static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\
const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \ const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
product_triangular_matrix_matrix_trmm<Scalar,Index,Mode, \ product_triangular_matrix_matrix_trmm<Scalar,Index,Mode, \
LhsIsTriangular,LhsStorageOrder,ConjugateLhs, \ LhsIsTriangular,LhsStorageOrder,ConjugateLhs, \
RhsStorageOrder, ConjugateRhs, ColMajor>::run( \ RhsStorageOrder, ConjugateRhs, ColMajor>::run( \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
} \ } \
}; };
@ -75,7 +77,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false) EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)
// implements col-major += alpha * op(triangular) * op(general) // implements col-major += alpha * op(triangular) * op(general)
#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ #define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, int Mode, \ template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \ int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \ int RhsStorageOrder, bool ConjugateRhs> \
@ -115,8 +117,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \ if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
/* Most likely no benefit to call TRMM or GEMM from BLAS */ \ /* Most likely no benefit to call TRMM or GEMM from BLAS */ \
product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \ LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \
/*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \ /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \
} else { \ } else { \
/* Make sense to call GEMM */ \ /* Make sense to call GEMM */ \
@ -124,8 +126,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \ MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \ BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \ gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \ general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \
rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \ rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, 1, resStride, alpha, gemm_blocking, 0); \
\ \
/*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \ /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
} \ } \
@ -172,7 +174,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
} \ } \
/*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \ /*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \
/* call ?trmm*/ \ /* call ?trmm*/ \
BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
\ \
/* Add op(a_triangular)*b into res*/ \ /* Add op(a_triangular)*b into res*/ \
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@ -180,13 +182,20 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
} \ } \
}; };
EIGEN_BLAS_TRMM_L(double, double, d, d) #ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z) EIGEN_BLAS_TRMM_L(double, double, d, dtrmm)
EIGEN_BLAS_TRMM_L(float, float, f, s) EIGEN_BLAS_TRMM_L(dcomplex, MKL_Complex16, cd, ztrmm)
EIGEN_BLAS_TRMM_L(scomplex, float, cf, c) EIGEN_BLAS_TRMM_L(float, float, f, strmm)
EIGEN_BLAS_TRMM_L(scomplex, MKL_Complex8, cf, ctrmm)
#else
EIGEN_BLAS_TRMM_L(double, double, d, dtrmm_)
EIGEN_BLAS_TRMM_L(dcomplex, double, cd, ztrmm_)
EIGEN_BLAS_TRMM_L(float, float, f, strmm_)
EIGEN_BLAS_TRMM_L(scomplex, float, cf, ctrmm_)
#endif
// implements col-major += alpha * op(general) * op(triangular) // implements col-major += alpha * op(general) * op(triangular)
#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ #define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, int Mode, \ template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \ int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \ int RhsStorageOrder, bool ConjugateRhs> \
@ -225,8 +234,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \ if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
/* Most likely no benefit to call TRMM or GEMM from BLAS*/ \ /* Most likely no benefit to call TRMM or GEMM from BLAS*/ \
product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \ LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \
/*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \ /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \
} else { \ } else { \
/* Make sense to call GEMM */ \ /* Make sense to call GEMM */ \
@ -234,8 +243,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \ MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \ BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \ gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \ general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \
rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \ rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, 1, resStride, alpha, gemm_blocking, 0); \
\ \
/*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \ /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
} \ } \
@ -282,7 +291,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
} \ } \
/*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \ /*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \
/* call ?trmm*/ \ /* call ?trmm*/ \
BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
\ \
/* Add op(a_triangular)*b into res*/ \ /* Add op(a_triangular)*b into res*/ \
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@ -290,11 +299,17 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
} \ } \
}; };
EIGEN_BLAS_TRMM_R(double, double, d, d) #ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z) EIGEN_BLAS_TRMM_R(double, double, d, dtrmm)
EIGEN_BLAS_TRMM_R(float, float, f, s) EIGEN_BLAS_TRMM_R(dcomplex, MKL_Complex16, cd, ztrmm)
EIGEN_BLAS_TRMM_R(scomplex, float, cf, c) EIGEN_BLAS_TRMM_R(float, float, f, strmm)
EIGEN_BLAS_TRMM_R(scomplex, MKL_Complex8, cf, ctrmm)
#else
EIGEN_BLAS_TRMM_R(double, double, d, dtrmm_)
EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_)
EIGEN_BLAS_TRMM_R(float, float, f, strmm_)
EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_)
#endif
} // end namespace internal } // end namespace internal
} // end namespace Eigen } // end namespace Eigen
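The hard-coded 1 arguments above pin the BLAS fallback paths to unit inner stride, mirroring the SYMM/HEMM change earlier. Also visible in these hunks is the non-square dispatch heuristic; a paraphrase under my own names (the 0.5 threshold and single-thread test are copied from the macro, the rest of the control flow is only partially visible here):

#include <algorithm>
#include <cstdio>

// Sketch of the non-square dispatch in the TRMM_L/TRMM_R macros above.
static const char* pick_path(long rows, long depth, int nthr) {
  long diagSize = std::min(rows, depth);
  double overhang = (std::max(rows, depth) - diagSize) / (double)diagSize;
  return (nthr == 1 && overhang < 0.5)
             ? "Eigen built-in kernel"        // little to gain from BLAS
             : "densified triangle + ?gemm";  // bulk of work goes to GEMM
}

int main() {
  std::printf("%s\n", pick_path(1000, 900, 1));  // overhang ~0.11
  std::printf("%s\n", pick_path(1000, 100, 1));  // overhang 9.0
  return 0;
}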
@@ -221,8 +221,9 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
     typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
     typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);

-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
-                                  * RhsBlasTraits::extractScalarFactor(rhs);
+    LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
+    RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
+    ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;

     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
@@ -274,6 +275,12 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
       else
         dest = MappedDest(actualDestPtr, dest.size());
     }
+
+    if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) )
+    {
+      Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+      dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize);
+    }
   }
 };
@@ -295,8 +302,9 @@ template<int Mode> struct trmv_selector<Mode,RowMajor>
     typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
     typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);

-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
-                                  * RhsBlasTraits::extractScalarFactor(rhs);
+    LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
+    RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
+    ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;

     enum {
       DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
@@ -326,6 +334,12 @@ template<int Mode> struct trmv_selector<Mode,RowMajor>
             actualRhsPtr,1,
             dest.data(),dest.innerStride(),
             actualAlpha);
+
+    if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) )
+    {
+      Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+      dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize);
+    }
   }
 };
@@ -71,7 +71,7 @@ EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
 EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)

 // implements col-major: res += alpha * op(triangular) * vector
-#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
 template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
 struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
  enum { \
@@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
    diag = IsUnitDiag ? 'U' : 'N'; \
 \
    /* call ?TRMV*/ \
-   BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+   BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
 \
    /* Add op(a_tr)rhs into res*/ \
-   BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+   BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
    /* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
    if (size<(std::max)(rows,cols)) { \
      if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
@@ -142,18 +142,25 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
        m = convert_index<BlasIndex>(size); \
        n = convert_index<BlasIndex>(cols-size); \
      } \
-     BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
+     BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
    } \
  } \
 };

-EIGEN_BLAS_TRMV_CM(double, double, d, d)
-EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z)
-EIGEN_BLAS_TRMV_CM(float, float, f, s)
-EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c)
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_CM(double, double, d, d,)
+EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,)
+EIGEN_BLAS_TRMV_CM(float, float, f, s,)
+EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,)
+#else
+EIGEN_BLAS_TRMV_CM(double, double, d, d, _)
+EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _)
+EIGEN_BLAS_TRMV_CM(float, float, f, s, _)
+EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _)
+#endif

 // implements row-major: res += alpha * op(triangular) * vector
-#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
 template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
 struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
  enum { \
@@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
    diag = IsUnitDiag ? 'U' : 'N'; \
 \
    /* call ?TRMV*/ \
-   BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+   BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
 \
    /* Add op(a_tr)rhs into res*/ \
-   BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+   BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
    /* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
    if (size<(std::max)(rows,cols)) { \
      if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
@@ -224,15 +231,22 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
        m = convert_index<BlasIndex>(size); \
        n = convert_index<BlasIndex>(cols-size); \
      } \
-     BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
+     BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
    } \
  } \
 };

-EIGEN_BLAS_TRMV_RM(double, double, d, d)
-EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z)
-EIGEN_BLAS_TRMV_RM(float, float, f, s)
-EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c)
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_RM(double, double, d, d,)
+EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,)
+EIGEN_BLAS_TRMV_RM(float, float, f, s,)
+EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,)
+#else
+EIGEN_BLAS_TRMV_RM(double, double, d, d,_)
+EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_)
+EIGEN_BLAS_TRMV_RM(float, float, f, s,_)
+EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_)
+#endif

 } // end namespase internal
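Unlike the TRSM/TRMM macros, the TRMV macros keep the prefix-pasting scheme and add a BLASPOSTFIX parameter, empty for the MKL branch and _ for reference BLAS. Empty macro arguments are legal since C99/C++11; a toy demonstration with hypothetical stubs:

#include <cstdio>

// Hypothetical stubs standing in for the two bindings of ?trmv.
void dtrmv (void) { std::printf("dtrmv  (MKL-style)\n"); }
void dtrmv_(void) { std::printf("dtrmv_ (reference BLAS)\n"); }

// Same pasting scheme as EIGEN_BLAS_TRMV_CM/RM: prefix ## root ## postfix.
#define CALL_TRMV(PRE, POST) PRE##trmv##POST()

int main() {
  CALL_TRMV(d, );   // empty postfix -> dtrmv()  (the MKL branch)
  CALL_TRMV(d, _);  // '_' postfix   -> dtrmv_() (the #else branch)
  return 0;
}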
@@ -15,48 +15,48 @@ namespace Eigen {
 namespace internal {

 // if the rhs is row major, let's transpose the product
-template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder>
-struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor>
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor,OtherInnerStride>
 {
   static void run(
     Index size, Index cols,
     const Scalar* tri, Index triStride,
-    Scalar* _other, Index otherStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
     level3_blocking<Scalar,Scalar>& blocking)
   {
     triangular_solve_matrix<
       Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft,
       (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper),
       NumTraits<Scalar>::IsComplex && Conjugate,
-      TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor>
-      ::run(size, cols, tri, triStride, _other, otherStride, blocking);
+      TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor, OtherInnerStride>
+      ::run(size, cols, tri, triStride, _other, otherIncr, otherStride, blocking);
   }
 };

 /* Optimized triangular solver with multiple right hand side and the triangular matrix on the left
  */
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
-struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder,int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
 {
   static EIGEN_DONT_INLINE void run(
     Index size, Index otherSize,
     const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
     level3_blocking<Scalar,Scalar>& blocking);
 };

-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
-EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
     Index size, Index otherSize,
     const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
     level3_blocking<Scalar,Scalar>& blocking)
 {
   Index cols = otherSize;

   typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
-  typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
+  typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> OtherMapper;
   TriMapper tri(_tri, triStride);
-  OtherMapper other(_other, otherStride);
+  OtherMapper other(_other, otherStride, otherIncr);

   typedef gebp_traits<Scalar,Scalar> Traits;
@@ -128,19 +128,19 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
             {
               Scalar b(0);
               const Scalar* l = &tri(i,s);
-              Scalar* r = &other(s,j);
+              typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
               for (Index i3=0; i3<k; ++i3)
-                b += conj(l[i3]) * r[i3];
+                b += conj(l[i3]) * r(i3);

               other(i,j) = (other(i,j) - b)*a;
             }
             else
             {
               Scalar b = (other(i,j) *= a);
-              Scalar* r = &other(s,j);
-              const Scalar* l = &tri(s,i);
+              typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
+              typename TriMapper::LinearMapper l = tri.getLinearMapper(s,i);
               for (Index i3=0;i3<rs;++i3)
-                r[i3] -= b * conj(l[i3]);
+                r(i3) -= b * conj(l(i3));
             }
           }
         }
@@ -185,28 +185,28 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
 /* Optimized triangular solver with multiple left hand sides and the triangular matrix on the right
  */
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
-struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
 {
   static EIGEN_DONT_INLINE void run(
     Index size, Index otherSize,
     const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
     level3_blocking<Scalar,Scalar>& blocking);
 };

-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
-EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
     Index size, Index otherSize,
     const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
     level3_blocking<Scalar,Scalar>& blocking)
 {
   Index rows = otherSize;
   typedef typename NumTraits<Scalar>::Real RealScalar;

-  typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
+  typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> LhsMapper;
   typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
-  LhsMapper lhs(_other, otherStride);
+  LhsMapper lhs(_other, otherStride, otherIncr);
   RhsMapper rhs(_tri, triStride);

   typedef gebp_traits<Scalar,Scalar> Traits;
@@ -297,24 +297,24 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
           {
             Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k;

-            Scalar* r = &lhs(i2,j);
+            typename LhsMapper::LinearMapper r = lhs.getLinearMapper(i2,j);
             for (Index k3=0; k3<k; ++k3)
             {
               Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j));
-              Scalar* a = &lhs(i2,IsLower ? j+1+k3 : absolute_j2+k3);
+              typename LhsMapper::LinearMapper a = lhs.getLinearMapper(i2,IsLower ? j+1+k3 : absolute_j2+k3);
               for (Index i=0; i<actual_mc; ++i)
-                r[i] -= a[i] * b;
+                r(i) -= a(i) * b;
             }
             if((Mode & UnitDiag)==0)
             {
               Scalar inv_rjj = RealScalar(1)/conj(rhs(j,j));
               for (Index i=0; i<actual_mc; ++i)
-                r[i] *= inv_rjj;
+                r(i) *= inv_rjj;
             }
           }

           // pack the just computed part of lhs to A
-          pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
+          pack_lhs_panel(blockA, lhs.getSubMapper(i2,absolute_j2),
                          actualPanelWidth, actual_mc,
                          actual_kc, j2);
         }
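The Scalar* r = &other(s,j) to LinearMapper rewrites above are what make the solver correct for a non-unit inner stride: the mapper's operator() hides the increment, so r[i] becomes r(i) with no other change to the loop body. A stripped-down sketch of the idea (names are mine, not Eigen's):

#include <cstddef>
#include <cstdio>

// Stripped-down analogue of OtherMapper::LinearMapper: operator() hides
// the inner increment, so the solver loops need not know the layout.
struct StridedRef {
  double* data;
  std::ptrdiff_t incr;
  double& operator()(std::ptrdiff_t i) const { return data[i * incr]; }
};

int main() {
  double buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  StridedRef contiguous{buf, 1};  // the old Scalar* r = &other(s,j) case
  StridedRef strided{buf, 2};     // what r(i) additionally supports now
  std::printf("%g %g\n", contiguous(3), strided(3));  // prints: 3 6
  return 0;
}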
@@ -38,9 +38,9 @@ namespace Eigen {
 namespace internal {

 // implements LeftSide op(triangular)^-1 * general
-#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \
+#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \
 template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
-struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
 { \
   enum { \
     IsLower = (Mode&Lower) == Lower, \
@@ -51,8 +51,10 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
   static void run( \
       Index size, Index otherSize, \
       const EIGTYPE* _tri, Index triStride, \
-      EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+      EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
   { \
+    EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
+    eigen_assert(otherIncr == 1); \
    BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb; \
    char side = 'L', uplo, diag='N', transa; \
    /* Set alpha_ */ \
@@ -80,20 +82,26 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
    } \
    if (IsUnitDiag) diag='U'; \
    /* call ?trsm*/ \
-   BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
+   BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
  } \
 };

-EIGEN_BLAS_TRSM_L(double, double, d)
-EIGEN_BLAS_TRSM_L(dcomplex, double, z)
-EIGEN_BLAS_TRSM_L(float, float, s)
-EIGEN_BLAS_TRSM_L(scomplex, float, c)
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_L(double, double, dtrsm)
+EIGEN_BLAS_TRSM_L(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_L(float, float, strsm)
+EIGEN_BLAS_TRSM_L(scomplex, MKL_Complex8, ctrsm)
+#else
+EIGEN_BLAS_TRSM_L(double, double, dtrsm_)
+EIGEN_BLAS_TRSM_L(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_L(float, float, strsm_)
+EIGEN_BLAS_TRSM_L(scomplex, float, ctrsm_)
+#endif

 // implements RightSide general * op(triangular)^-1
-#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASPREFIX) \
+#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \
 template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
-struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
 { \
   enum { \
     IsLower = (Mode&Lower) == Lower, \
@@ -104,8 +112,10 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
   static void run( \
       Index size, Index otherSize, \
       const EIGTYPE* _tri, Index triStride, \
-      EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+      EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
   { \
+    EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
+    eigen_assert(otherIncr == 1); \
    BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb; \
    char side = 'R', uplo, diag='N', transa; \
    /* Set alpha_ */ \
@@ -133,16 +143,22 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
    } \
    if (IsUnitDiag) diag='U'; \
    /* call ?trsm*/ \
-   BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
+   BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
    /*std::cout << "TRMS_L specialization!\n";*/ \
  } \
 };

-EIGEN_BLAS_TRSM_R(double, double, d)
-EIGEN_BLAS_TRSM_R(dcomplex, double, z)
-EIGEN_BLAS_TRSM_R(float, float, s)
-EIGEN_BLAS_TRSM_R(scomplex, float, c)
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_R(double, double, dtrsm)
+EIGEN_BLAS_TRSM_R(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_R(float, float, strsm)
+EIGEN_BLAS_TRSM_R(scomplex, MKL_Complex8, ctrsm)
+#else
+EIGEN_BLAS_TRSM_R(double, double, dtrsm_)
+EIGEN_BLAS_TRSM_R(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_R(float, float, strsm_)
+EIGEN_BLAS_TRSM_R(scomplex, float, ctrsm_)
+#endif

 } // end namespace internal
@ -31,7 +31,7 @@ template<
typename Index, typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResStorageOrder> int ResStorageOrder, int ResInnerStride>
struct general_matrix_matrix_product; struct general_matrix_matrix_product;
template<typename Index, template<typename Index,
@ -155,13 +155,21 @@ class BlasVectorMapper {
Scalar* m_data; Scalar* m_data;
}; };
template<typename Scalar, typename Index, int AlignmentType, int Incr=1>
class BlasLinearMapper;
template<typename Scalar, typename Index, int AlignmentType> template<typename Scalar, typename Index, int AlignmentType>
class BlasLinearMapper { class BlasLinearMapper<Scalar,Index,AlignmentType,1> {
public: public:
typedef typename packet_traits<Scalar>::type Packet; typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket; typedef typename packet_traits<Scalar>::half HalfPacket;
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data, Index incr=1)
: m_data(data)
{
EIGEN_ONLY_USED_FOR_DEBUG(incr);
eigen_assert(incr==1);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
internal::prefetch(&operator()(i)); internal::prefetch(&operator()(i));
@ -188,16 +196,25 @@ class BlasLinearMapper {
}; };
// Lightweight helper class to access matrix coefficients. // Lightweight helper class to access matrix coefficients.
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned> template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned, int Incr = 1>
class blas_data_mapper { class blas_data_mapper;
public:
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType>
class blas_data_mapper<Scalar,Index,StorageOrder,AlignmentType,1>
{
public:
typedef typename packet_traits<Scalar>::type Packet; typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket; typedef typename packet_traits<Scalar>::half HalfPacket;
typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper; typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
typedef BlasVectorMapper<Scalar, Index> VectorMapper; typedef BlasVectorMapper<Scalar, Index> VectorMapper;
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr=1)
: m_data(data), m_stride(stride)
{
EIGEN_ONLY_USED_FOR_DEBUG(incr);
eigen_assert(incr==1);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
getSubMapper(Index i, Index j) const { getSubMapper(Index i, Index j) const {
@ -251,6 +268,90 @@ class blas_data_mapper {
const Index m_stride; const Index m_stride;
}; };
// Implementation of non-natural increment (i.e. inner-stride != 1)
// The exposed API is not complete yet compared to the Incr==1 case
// because some features makes less sense in this case.
template<typename Scalar, typename Index, int AlignmentType, int Incr>
class BlasLinearMapper
{
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data,Index incr) : m_data(data), m_incr(incr) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
internal::prefetch(&operator()(i));
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
return m_data[i*m_incr.value()];
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
return pgather<Scalar,Packet>(m_data + i*m_incr.value(), m_incr.value());
}
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
pscatter<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value());
}
protected:
Scalar *m_data;
const internal::variable_if_dynamic<Index,Incr> m_incr;
};
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType,int Incr>
class blas_data_mapper
{
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
typedef BlasLinearMapper<Scalar, Index, AlignmentType,Incr> LinearMapper;
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr) : m_data(data), m_stride(stride), m_incr(incr) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper
getSubMapper(Index i, Index j) const {
return blas_data_mapper(&operator()(i, j), m_stride, m_incr.value());
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
return LinearMapper(&operator()(i, j), m_incr.value());
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
return m_data[StorageOrder==RowMajor ? j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride];
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
return pgather<Scalar,Packet>(&operator()(i, j),m_incr.value());
}
template <typename PacketT, int AlignmentT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
return pgather<Scalar,PacketT>(&operator()(i, j),m_incr.value());
}
template<typename SubPacket>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
}
template<typename SubPacket>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
}
protected:
Scalar* EIGEN_RESTRICT m_data;
const Index m_stride;
const internal::variable_if_dynamic<Index,Incr> m_incr;
};
// lightweight helper class to access matrix coefficients (const version) // lightweight helper class to access matrix coefficients (const version)
template<typename Scalar, typename Index, int StorageOrder> template<typename Scalar, typename Index, int StorageOrder>
class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> { class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {
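To make the new Incr-aware mappers concrete: with an inner stride, element (i, j) of a column-major block lives at data[i*incr + j*stride], which is exactly what the new operator() above computes. A minimal standalone sketch of that addressing scheme (the names are illustrative, not Eigen's internal API):

// Standalone sketch of the strided addressing used by the Incr-aware mappers.
#include <cstddef>
#include <iostream>

template<typename Scalar>
struct StridedMapper {
  Scalar* data;
  std::ptrdiff_t stride; // outer (column) stride
  std::ptrdiff_t incr;   // inner stride; 1 recovers the classic mapper

  // Column-major addressing, mirroring operator()(i, j) above.
  Scalar& operator()(std::ptrdiff_t i, std::ptrdiff_t j) const {
    return data[i * incr + j * stride];
  }
};

int main() {
  double buf[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  StridedMapper<double> m = {buf, 4, 2}; // 2x3 view, elements two doubles apart
  std::cout << m(1, 2) << "\n";          // buf[1*2 + 2*4] == buf[10] == 10
  return 0;
}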


@ -43,13 +43,24 @@
#endif #endif
#pragma clang diagnostic ignored "-Wconstant-logical-operand" #pragma clang diagnostic ignored "-Wconstant-logical-operand"
#elif defined __GNUC__ && __GNUC__>=6 #elif defined __GNUC__
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
#pragma GCC diagnostic push #pragma GCC diagnostic push
#endif #endif
#pragma GCC diagnostic ignored "-Wignored-attributes" // g++ warns about local variables shadowing member functions, which is too strict
#pragma GCC diagnostic ignored "-Wshadow"
#if __GNUC__ == 4 && __GNUC_MINOR__ < 8
// Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
#pragma GCC diagnostic ignored "-Wtype-limits"
#endif
#if __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
#if __GNUC__==7
// See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
#pragma GCC diagnostic ignored "-Wattributes"
#endif
#endif #endif
#if defined __NVCC__ #if defined __NVCC__
@ -72,4 +83,12 @@
#pragma diag_suppress 2737 #pragma diag_suppress 2737
#endif #endif
#else
// warnings already disabled:
# ifndef EIGEN_WARNINGS_DISABLED_2
# define EIGEN_WARNINGS_DISABLED_2
# elif defined(EIGEN_INTERNAL_DEBUGGING)
# error "Do not include \"DisableStupidWarnings.h\" recursively more than twice!"
# endif
#endif // not EIGEN_WARNINGS_DISABLED #endif // not EIGEN_WARNINGS_DISABLED
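For reference, the push/pop pairing can be bypassed from user code via the documented opt-out macro; a minimal sketch:

// Define the opt-out before the first Eigen include so the suppressions
// above stay active for the whole translation unit (no matching pop).
#define EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
#include <Eigen/Core>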


@ -47,11 +47,7 @@ template<typename T> struct NumTraits;
template<typename Derived> struct EigenBase; template<typename Derived> struct EigenBase;
template<typename Derived> class DenseBase; template<typename Derived> class DenseBase;
template<typename Derived> class PlainObjectBase; template<typename Derived> class PlainObjectBase;
template<typename Derived, int Level> class DenseCoeffsBase;
template<typename Derived,
int Level = internal::accessors_level<Derived>::value >
class DenseCoeffsBase;
template<typename _Scalar, int _Rows, int _Cols, template<typename _Scalar, int _Rows, int _Cols,
int _Options = AutoAlign | int _Options = AutoAlign |


@ -49,10 +49,11 @@
#define EIGEN_USE_LAPACKE #define EIGEN_USE_LAPACKE
#endif #endif
#if defined(EIGEN_USE_MKL_VML) #if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL)
#define EIGEN_USE_MKL #define EIGEN_USE_MKL
#endif #endif
#if defined EIGEN_USE_MKL #if defined EIGEN_USE_MKL
# include <mkl.h> # include <mkl.h>
/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/ /*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
@ -108,6 +109,10 @@
#endif #endif
#endif #endif
#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL)
#include "../../misc/blas.h"
#endif
namespace Eigen { namespace Eigen {
typedef std::complex<double> dcomplex; typedef std::complex<double> dcomplex;
@ -121,8 +126,5 @@ typedef int BlasIndex;
} // end namespace Eigen } // end namespace Eigen
#if defined(EIGEN_USE_BLAS)
#include "../../misc/blas.h"
#endif
#endif // EIGEN_MKL_SUPPORT_H #endif // EIGEN_MKL_SUPPORT_H
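With this reordering, the fallback BLAS prototypes in misc/blas.h are pulled in only when MKL is not already providing them. A minimal usage sketch:

// Link against an external BLAS: define the switch before including Eigen.
// When EIGEN_USE_MKL is also defined, mkl.h supplies the prototypes instead.
#define EIGEN_USE_BLAS
#include <Eigen/Dense>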


@ -13,7 +13,7 @@
#define EIGEN_WORLD_VERSION 3 #define EIGEN_WORLD_VERSION 3
#define EIGEN_MAJOR_VERSION 3 #define EIGEN_MAJOR_VERSION 3
#define EIGEN_MINOR_VERSION 2 #define EIGEN_MINOR_VERSION 9
#define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
(EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
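Downstream code can key off the bumped version number with the macro defined above, e.g.:

#include <Eigen/Core>
#if EIGEN_VERSION_AT_LEAST(3,3,9)
// Rely on behavior first shipped in 3.3.9.
#endif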
@ -80,8 +80,8 @@
// 2015 14 1900 // 2015 14 1900
// "15" 15 1900 // "15" 15 1900
/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not, e.g., ICC /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not, e.g., ICC or clang-cl
#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC) #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG)
#define EIGEN_COMP_MSVC_STRICT _MSC_VER #define EIGEN_COMP_MSVC_STRICT _MSC_VER
#else #else
#define EIGEN_COMP_MSVC_STRICT 0 #define EIGEN_COMP_MSVC_STRICT 0
@ -380,7 +380,8 @@
#if EIGEN_MAX_CPP_VER>=11 && \ #if EIGEN_MAX_CPP_VER>=11 && \
((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \ ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \
|| (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \ || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \
|| (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \
|| (EIGEN_COMP_MSVC >= 1900) )
#define EIGEN_HAS_C99_MATH 1 #define EIGEN_HAS_C99_MATH 1
#else #else
#define EIGEN_HAS_C99_MATH 0 #define EIGEN_HAS_C99_MATH 0
@ -396,10 +397,24 @@
#endif #endif
#endif #endif
// Does the compiler support type_traits?
// - full support of type traits was added only to GCC 5.1.0.
// - 20150626 corresponds to the last release of 4.x libstdc++
#ifndef EIGEN_HAS_TYPE_TRAITS
#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) \
&& ((!EIGEN_COMP_GNUC_STRICT) || EIGEN_GNUC_AT_LEAST(5, 1)) \
&& ((!defined(__GLIBCXX__)) || __GLIBCXX__ > 20150626)
#define EIGEN_HAS_TYPE_TRAITS 1
#define EIGEN_INCLUDE_TYPE_TRAITS
#else
#define EIGEN_HAS_TYPE_TRAITS 0
#endif
#endif
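User code can test the same feature macro; a small sketch, assuming a C++11 build where the macro evaluates to 1:

#include <Eigen/Core>
#if EIGEN_HAS_TYPE_TRAITS
#include <type_traits>
// Full <type_traits> support is guaranteed here (GCC >= 5.1 or non-GNU).
static_assert(std::is_trivially_copyable<float>::value, "sanity check");
#endif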
// Does the compiler support variadic templates? // Does the compiler support variadic templates?
#ifndef EIGEN_HAS_VARIADIC_TEMPLATES #ifndef EIGEN_HAS_VARIADIC_TEMPLATES
#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
&& ( !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) ) && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) )
// ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices: // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
// this prevents nvcc from crashing when compiling Eigen on Tegra X1 // this prevents nvcc from crashing when compiling Eigen on Tegra X1
#define EIGEN_HAS_VARIADIC_TEMPLATES 1 #define EIGEN_HAS_VARIADIC_TEMPLATES 1
@ -413,7 +428,7 @@
#ifdef __CUDACC__ #ifdef __CUDACC__
// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)) #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500))
#define EIGEN_HAS_CONSTEXPR 1 #define EIGEN_HAS_CONSTEXPR 1
#endif #endif
#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
@ -487,11 +502,13 @@
// EIGEN_STRONG_INLINE is a stronger version of the inline, using __forceinline on MSVC, // EIGEN_STRONG_INLINE is a stronger version of the inline, using __forceinline on MSVC,
// but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline // but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline
// but GCC is still doing fine with just inline. // but GCC is still doing fine with just inline.
#ifndef EIGEN_STRONG_INLINE
#if EIGEN_COMP_MSVC || EIGEN_COMP_ICC #if EIGEN_COMP_MSVC || EIGEN_COMP_ICC
#define EIGEN_STRONG_INLINE __forceinline #define EIGEN_STRONG_INLINE __forceinline
#else #else
#define EIGEN_STRONG_INLINE inline #define EIGEN_STRONG_INLINE inline
#endif #endif
#endif
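The new #ifndef guard lets a project override the definition, for instance to cut debug-build times where __forceinline is too aggressive:

// Must come before any Eigen header; the guard above then skips
// the __forceinline default.
#define EIGEN_STRONG_INLINE inline
#include <Eigen/Core>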
// EIGEN_ALWAYS_INLINE is the strongest, it has the effect of making the function inline and adding every possible // EIGEN_ALWAYS_INLINE is the strongest, it has the effect of making the function inline and adding every possible
// attribute to maximize inlining. This should only be used when really necessary: in particular, // attribute to maximize inlining. This should only be used when really necessary: in particular,
@ -812,7 +829,8 @@ namespace Eigen {
// just an empty macro ! // just an empty macro !
#define EIGEN_EMPTY #define EIGEN_EMPTY
#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || __CUDACC_VER__) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) #if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0)
// for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
using Base::operator =; using Base::operator =;
#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
@ -832,11 +850,48 @@ namespace Eigen {
#endif #endif
/**
* \internal
* \brief Macro to explicitly define the default copy constructor.
* This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden.
*/
#if EIGEN_HAS_CXX11
#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default;
#else
#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS)
#endif
/** \internal /** \internal
* \brief Macro to manually inherit assignment operators. * \brief Macro to manually inherit assignment operators.
* This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined. * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
* With C++11 or later this also default-implements the copy-constructor
*/ */
#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) #define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)
/** \internal
* \brief Macro to manually define default constructors and destructors.
* This is necessary when the copy constructor is re-defined.
* For empty helper classes this should usually be protected, to avoid accidentally creating empty objects.
*
* Hiding the default destructor leads to problems in C++03 mode together with boost::multiprecision
*/
#if EIGEN_HAS_CXX11
#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \
EIGEN_DEVICE_FUNC Derived() = default; \
EIGEN_DEVICE_FUNC ~Derived() = default;
#else
#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \
EIGEN_DEVICE_FUNC Derived() {}; \
/* EIGEN_DEVICE_FUNC ~Derived() {}; */
#endif
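A sketch of the C++11 pitfall these macros address, using a hypothetical Wrapper class: a user-declared operator= makes the implicitly generated copy constructor deprecated (-Wdeprecated-copy), so it has to be re-defaulted explicitly.

struct Wrapper {
  Wrapper() = default;
  Wrapper(const Wrapper&) = default;         // what EIGEN_DEFAULT_COPY_CONSTRUCTOR expands to
  Wrapper& operator=(const Wrapper& other) { // custom assignment
    value = other.value;
    return *this;
  }
  int value = 0;
};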
/** /**
* Just a side note. Commenting within defines works only by documenting * Just a side note. Commenting within defines works only by documenting
@ -986,7 +1041,13 @@ namespace Eigen {
# define EIGEN_NOEXCEPT # define EIGEN_NOEXCEPT
# define EIGEN_NOEXCEPT_IF(x) # define EIGEN_NOEXCEPT_IF(x)
# define EIGEN_NO_THROW throw() # define EIGEN_NO_THROW throw()
# define EIGEN_EXCEPTION_SPEC(X) throw(X) # if EIGEN_COMP_MSVC
// MSVC does not support exception specifications (warning C4290),
// and they are deprecated in c++11 anyway.
# define EIGEN_EXCEPTION_SPEC(X) throw()
# else
# define EIGEN_EXCEPTION_SPEC(X) throw(X)
# endif
#endif #endif
#endif // EIGEN_MACROS_H #endif // EIGEN_MACROS_H


@ -70,7 +70,7 @@ inline void throw_std_bad_alloc()
throw std::bad_alloc(); throw std::bad_alloc();
#else #else
std::size_t huge = static_cast<std::size_t>(-1); std::size_t huge = static_cast<std::size_t>(-1);
new int[huge]; ::operator new(huge);
#endif #endif
} }
@ -150,7 +150,7 @@ EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements. /** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
* On allocation error, the returned pointer is null, and std::bad_alloc is thrown. * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
*/ */
EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size) EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
{ {
check_that_malloc_is_allowed(); check_that_malloc_is_allowed();
@ -185,7 +185,7 @@ EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
* \brief Reallocates an aligned block of memory. * \brief Reallocates an aligned block of memory.
* \throws std::bad_alloc on allocation failure * \throws std::bad_alloc on allocation failure
*/ */
inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size)
{ {
EIGEN_UNUSED_VARIABLE(old_size); EIGEN_UNUSED_VARIABLE(old_size);
@ -209,12 +209,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
/** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned. /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
* On allocation error, the returned pointer is null, and a std::bad_alloc is thrown. * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
*/ */
template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size) template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std::size_t size)
{ {
return aligned_malloc(size); return aligned_malloc(size);
} }
template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(size_t size) template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std::size_t size)
{ {
check_that_malloc_is_allowed(); check_that_malloc_is_allowed();
@ -235,12 +235,12 @@ template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *p
std::free(ptr); std::free(ptr);
} }
template<bool Align> inline void* conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size) template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
{ {
return aligned_realloc(ptr, new_size, old_size); return aligned_realloc(ptr, new_size, old_size);
} }
template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new_size, size_t) template<> inline void* conditional_aligned_realloc<false>(void* ptr, std::size_t new_size, std::size_t)
{ {
return std::realloc(ptr, new_size); return std::realloc(ptr, new_size);
} }
@ -252,7 +252,7 @@ template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new
/** \internal Destructs the elements of an array. /** \internal Destructs the elements of an array.
* The \a size parameter tells how many objects to call the destructor of T on. * The \a size parameter tells how many objects to call the destructor of T on.
*/ */
template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size) template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, std::size_t size)
{ {
// always destruct an array starting from the end. // always destruct an array starting from the end.
if(ptr) if(ptr)
@ -262,9 +262,9 @@ template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T
/** \internal Constructs the elements of an array. /** \internal Constructs the elements of an array.
* The \a size parameter tells how many objects to call the constructor of T on. * The \a size parameter tells how many objects to call the constructor of T on.
*/ */
template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size) template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size)
{ {
size_t i; std::size_t i;
EIGEN_TRY EIGEN_TRY
{ {
for (i = 0; i < size; ++i) ::new (ptr + i) T; for (i = 0; i < size; ++i) ::new (ptr + i) T;
@ -283,9 +283,9 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *
*****************************************************************************/ *****************************************************************************/
template<typename T> template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t size)
{ {
if(size > size_t(-1) / sizeof(T)) if(size > std::size_t(-1) / sizeof(T))
throw_std_bad_alloc(); throw_std_bad_alloc();
} }
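The guard rejects element counts whose byte size would wrap around; an equivalent standalone check:

#include <cstddef>

// True when n * sizeof(T) would overflow std::size_t.
template<typename T>
bool would_overflow(std::size_t n) {
  return n > std::size_t(-1) / sizeof(T);
}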
@ -293,7 +293,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
* On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown. * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
* The default constructor of T is called. * The default constructor of T is called.
*/ */
template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size) template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size)
{ {
check_size_for_overflow<T>(size); check_size_for_overflow<T>(size);
T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size)); T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
@ -309,7 +309,7 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
return result; return result;
} }
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size) template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size)
{ {
check_size_for_overflow<T>(size); check_size_for_overflow<T>(size);
T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size)); T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
@ -328,7 +328,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
/** \internal Deletes objects constructed with aligned_new /** \internal Deletes objects constructed with aligned_new
* The \a size parameter tells how many objects to call the destructor of T on. * The \a size parameter tells how many objects to call the destructor of T on.
*/ */
template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size) template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size)
{ {
destruct_elements_of_array<T>(ptr, size); destruct_elements_of_array<T>(ptr, size);
aligned_free(ptr); aligned_free(ptr);
@ -337,13 +337,13 @@ template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t
/** \internal Deletes objects constructed with conditional_aligned_new /** \internal Deletes objects constructed with conditional_aligned_new
* The \a size parameter tells how many objects to call the destructor of T on. * The \a size parameter tells how many objects to call the destructor of T on.
*/ */
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size) template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, std::size_t size)
{ {
destruct_elements_of_array<T>(ptr, size); destruct_elements_of_array<T>(ptr, size);
conditional_aligned_free<Align>(ptr); conditional_aligned_free<Align>(ptr);
} }
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, std::size_t new_size, std::size_t old_size)
{ {
check_size_for_overflow<T>(new_size); check_size_for_overflow<T>(new_size);
check_size_for_overflow<T>(old_size); check_size_for_overflow<T>(old_size);
@ -366,7 +366,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
} }
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size) template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(std::size_t size)
{ {
if(size==0) if(size==0)
return 0; // short-cut. Also fixes Bug 884 return 0; // short-cut. Also fixes Bug 884
@ -387,7 +387,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
return result; return result;
} }
template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(T* pts, size_t new_size, size_t old_size) template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size)
{ {
check_size_for_overflow<T>(new_size); check_size_for_overflow<T>(new_size);
check_size_for_overflow<T>(old_size); check_size_for_overflow<T>(old_size);
@ -409,7 +409,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(
return result; return result;
} }
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size) template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, std::size_t size)
{ {
if(NumTraits<T>::RequireInitialization) if(NumTraits<T>::RequireInitialization)
destruct_elements_of_array<T>(ptr, size); destruct_elements_of_array<T>(ptr, size);
@ -493,7 +493,7 @@ template<typename T> struct smart_copy_helper<T,true> {
IntPtr size = IntPtr(end)-IntPtr(start); IntPtr size = IntPtr(end)-IntPtr(start);
if(size==0) return; if(size==0) return;
eigen_internal_assert(start!=0 && end!=0 && target!=0); eigen_internal_assert(start!=0 && end!=0 && target!=0);
memcpy(target, start, size); std::memcpy(target, start, size);
} }
}; };
@ -561,7 +561,7 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
* In this case, the buffer elements will also be destructed when this handler is destructed. * In this case, the buffer elements will also be destructed when this handler is destructed.
* Finally, if \a dealloc is true, then the pointer \a ptr is freed. * Finally, if \a dealloc is true, then the pointer \a ptr is freed.
**/ **/
aligned_stack_memory_handler(T* ptr, size_t size, bool dealloc) aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc)
: m_ptr(ptr), m_size(size), m_deallocate(dealloc) : m_ptr(ptr), m_size(size), m_deallocate(dealloc)
{ {
if(NumTraits<T>::RequireInitialization && m_ptr) if(NumTraits<T>::RequireInitialization && m_ptr)
@ -576,7 +576,7 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
} }
protected: protected:
T* m_ptr; T* m_ptr;
size_t m_size; std::size_t m_size;
bool m_deallocate; bool m_deallocate;
}; };
@ -655,15 +655,15 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
#if EIGEN_MAX_ALIGN_BYTES!=0 #if EIGEN_MAX_ALIGN_BYTES!=0
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
void* operator new(size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \ void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \
EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \ EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
EIGEN_CATCH (...) { return 0; } \ EIGEN_CATCH (...) { return 0; } \
} }
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \ #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \
void *operator new(size_t size) { \ void *operator new(std::size_t size) { \
return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \ return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
} \ } \
void *operator new[](size_t size) { \ void *operator new[](std::size_t size) { \
return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \ return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
} \ } \
void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \ void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
@ -673,8 +673,8 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
/* in-place new and delete. since (at least afaik) there is no actual */ \ /* in-place new and delete. since (at least afaik) there is no actual */ \
/* memory allocated we can safely let the default implementation handle */ \ /* memory allocated we can safely let the default implementation handle */ \
/* this particular case. */ \ /* this particular case. */ \
static void *operator new(size_t size, void *ptr) { return ::operator new(size,ptr); } \ static void *operator new(std::size_t size, void *ptr) { return ::operator new(size,ptr); } \
static void *operator new[](size_t size, void* ptr) { return ::operator new[](size,ptr); } \ static void *operator new[](std::size_t size, void* ptr) { return ::operator new[](size,ptr); } \
void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \ void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \
void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \ void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \
/* nothrow-new (returns zero instead of std::bad_alloc) */ \ /* nothrow-new (returns zero instead of std::bad_alloc) */ \
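Typical use of these operators from user code goes through the convenience macro; a minimal sketch:

#include <Eigen/Core>

struct Node {
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW // injects the aligned new/delete above
  Eigen::Vector4f position;       // fixed-size vectorizable member
};

// `new Node` now allocates through conditional_aligned_malloc, so
// `position` is safely aligned for packet loads.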
@ -696,7 +696,15 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
/** \class aligned_allocator /** \class aligned_allocator
* \ingroup Core_Module * \ingroup Core_Module
* *
* \brief STL compatible allocator to use with 16 byte aligned types * \brief STL compatible allocator to use with types requiring a non standard alignment.
*
* The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd.
* By default, it will thus provide at least 16-byte alignment, and more in the following cases:
* - 32-byte alignment if AVX is enabled.
* - 64-byte alignment if AVX512 is enabled.
*
* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
* \link TopicPreprocessorDirectivesPerformance there \endlink.
* *
* Example: * Example:
* \code * \code
@ -713,7 +721,7 @@ template<class T>
class aligned_allocator : public std::allocator<T> class aligned_allocator : public std::allocator<T>
{ {
public: public:
typedef size_t size_type; typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type; typedef std::ptrdiff_t difference_type;
typedef T* pointer; typedef T* pointer;
typedef const T* const_pointer; typedef const T* const_pointer;
@ -739,7 +747,15 @@ public:
pointer allocate(size_type num, const void* /*hint*/ = 0) pointer allocate(size_type num, const void* /*hint*/ = 0)
{ {
internal::check_size_for_overflow<T>(num); internal::check_size_for_overflow<T>(num);
return static_cast<pointer>( internal::aligned_malloc(num * sizeof(T)) ); size_type size = num * sizeof(T);
#if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0)
// workaround gcc bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544
// It triggered eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807
if(size>=std::size_t((std::numeric_limits<std::ptrdiff_t>::max)()))
return 0;
else
#endif
return static_cast<pointer>( internal::aligned_malloc(size) );
} }
void deallocate(pointer p, size_type /*num*/) void deallocate(pointer p, size_type /*num*/)

Some files were not shown because too many files have changed in this diff.