update eigen to 3.3.9
parent 4b8f73d81c
commit 8dc26dbe93
Eigen/Cholesky
@@ -9,6 +9,7 @@
 #define EIGEN_CHOLESKY_MODULE_H
 
 #include "Core"
+#include "Jacobi"
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
@@ -31,7 +32,11 @@
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/Cholesky/LLT_LAPACKE.h"
 #endif
 
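Note: the same four-line pattern (EIGEN_USE_MKL ? mkl_lapacke.h : bundled src/misc/lapacke.h) recurs below in Eigenvalues, LU, QR and SVD. A minimal sketch of how a user opts in — the defines must precede the include, the program must link against a LAPACKE (or MKL) library, and the matrix values are illustrative only:

    #define EIGEN_USE_LAPACKE        // route LLT/LDLT kernels to LAPACKE (?potrf etc.)
    // #define EIGEN_USE_MKL         // additionally use MKL's own mkl_lapacke.h
    #include <Eigen/Cholesky>
    #include <iostream>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
      Eigen::MatrixXd S = A * A.transpose() + 4.0 * Eigen::MatrixXd::Identity(4, 4);  // SPD
      Eigen::LLT<Eigen::MatrixXd> llt(S);
      std::cout << (llt.info() == Eigen::Success ? "ok" : "failed") << "\n";
    }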
Eigen/Core
@@ -14,6 +14,22 @@
 // first thing Eigen does: stop the compiler from committing suicide
 #include "src/Core/util/DisableStupidWarnings.h"
 
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+#define EIGEN_CUDACC __CUDACC__
+#endif
+
+#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
+#define EIGEN_CUDA_ARCH __CUDA_ARCH__
+#endif
+
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+#define EIGEN_CUDACC_VER __CUDACC_VER__
+#else
+#define EIGEN_CUDACC_VER 0
+#endif
+
 // Handle NVCC/CUDA/SYCL
 #if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
   // Do not try asserts on CUDA and SYCL!
@@ -37,9 +53,9 @@
 #endif
 
 #define EIGEN_DEVICE_FUNC __host__ __device__
-// We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro
+// We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
 // works properly on the device side
-#include <math_functions.hpp>
+#include <cuda_runtime.h>
 #else
 #define EIGEN_DEVICE_FUNC
 #endif
@@ -155,6 +171,9 @@
 #ifdef __AVX512DQ__
 #define EIGEN_VECTORIZE_AVX512DQ
 #endif
+#ifdef __AVX512ER__
+#define EIGEN_VECTORIZE_AVX512ER
+#endif
 #endif
 
 // include files
@@ -229,7 +248,7 @@
 #if defined __CUDACC__
 #define EIGEN_VECTORIZE_CUDA
 #include <vector_types.h>
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#if EIGEN_CUDACC_VER >= 70500
 #define EIGEN_HAS_CUDA_FP16
 #endif
 #endif
@@ -260,7 +279,10 @@
 #include <cmath>
 #include <cassert>
 #include <functional>
+#include <sstream>
+#ifndef EIGEN_NO_IO
 #include <iosfwd>
+#endif
 #include <cstring>
 #include <string>
 #include <limits>
@@ -321,12 +343,16 @@ inline static const char *SimdInstructionSetsInUse(void) {
 #error Eigen2-support is only available up to version 3.2. Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information
 #endif
 
+namespace Eigen {
+
 // we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
 // ensure QNX/QCC support
 using std::size_t;
 // gcc 4.6.0 wants std:: for ptrdiff_t
 using std::ptrdiff_t;
 
+}
+
 /** \defgroup Core_Module Core module
   * This is the main module of Eigen providing dense matrix and vector support
   * (both fixed and dynamic size) with all the features corresponding to a BLAS library
@@ -348,10 +374,13 @@ using std::ptrdiff_t;
 #include "src/Core/MathFunctions.h"
 #include "src/Core/GenericPacketMath.h"
 #include "src/Core/MathFunctionsImpl.h"
+#include "src/Core/arch/Default/ConjHelper.h"
 
 #if defined EIGEN_VECTORIZE_AVX512
   #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
   #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX/MathFunctions.h"
   #include "src/Core/arch/AVX512/PacketMath.h"
   #include "src/Core/arch/AVX512/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_AVX
@@ -363,6 +392,7 @@ using std::ptrdiff_t;
   #include "src/Core/arch/AVX/MathFunctions.h"
   #include "src/Core/arch/AVX/Complex.h"
   #include "src/Core/arch/AVX/TypeCasting.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
 #elif defined EIGEN_VECTORIZE_SSE
   #include "src/Core/arch/SSE/PacketMath.h"
   #include "src/Core/arch/SSE/MathFunctions.h"
@@ -405,6 +435,7 @@ using std::ptrdiff_t;
 // on CUDA devices
 #include "src/Core/arch/CUDA/Complex.h"
 
+#include "src/Core/IO.h"
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
@@ -452,7 +483,6 @@ using std::ptrdiff_t;
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
 #include "src/Core/Fuzzy.h"
-#include "src/Core/IO.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
 #include "src/Core/GeneralProduct.h"
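Note: CUDA 9 removed __CUDACC_VER__, so the new EIGEN_CUDACC_VER normalizes both version schemes to the old major*10000 + minor*100 encoding; the FP16 gate above then compares against 70500 (CUDA 7.5) uniformly. A host-side sketch of the arithmetic, with made-up version numbers:

    #include <cassert>

    int main() {
      // nvcc 9.1 exposes __CUDACC_VER_MAJOR__ = 9, __CUDACC_VER_MINOR__ = 1
      const int major = 9, minor = 1;
      const int eigen_cudacc_ver = major * 10000 + minor * 100;  // == 90100
      assert(eigen_cudacc_ver == 90100);
      assert(eigen_cudacc_ver >= 70500);  // CUDA 7.5 threshold for EIGEN_HAS_CUDA_FP16
      return 0;
    }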
Eigen/Eigenvalues
@@ -10,14 +10,14 @@
 
 #include "Core"
 
+#include "src/Core/util/DisableStupidWarnings.h"
+
 #include "Cholesky"
 #include "Jacobi"
 #include "Householder"
 #include "LU"
 #include "Geometry"
 
-#include "src/Core/util/DisableStupidWarnings.h"
-
 /** \defgroup Eigenvalues_Module Eigenvalues module
   *
   *
@@ -45,7 +45,11 @@
 #include "src/Eigenvalues/GeneralizedEigenSolver.h"
 #include "src/Eigenvalues/MatrixBaseEigenvalues.h"
 #ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/Eigenvalues/RealSchur_LAPACKE.h"
 #include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
 #include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
Eigen/Geometry
@@ -10,12 +10,12 @@
 
 #include "Core"
 
+#include "src/Core/util/DisableStupidWarnings.h"
+
 #include "SVD"
 #include "LU"
 #include <limits>
 
-#include "src/Core/util/DisableStupidWarnings.h"
-
 /** \defgroup Geometry_Module Geometry module
   *
   * This module provides support for:
Eigen/LU
@@ -28,7 +28,11 @@
 #include "src/LU/FullPivLU.h"
 #include "src/LU/PartialPivLU.h"
 #ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/LU/PartialPivLU_LAPACKE.h"
 #endif
 #include "src/LU/Determinant.h"
Eigen/QR
@@ -10,12 +10,12 @@
 
 #include "Core"
 
+#include "src/Core/util/DisableStupidWarnings.h"
+
 #include "Cholesky"
 #include "Jacobi"
 #include "Householder"
 
-#include "src/Core/util/DisableStupidWarnings.h"
-
 /** \defgroup QR_Module QR module
   *
   *
@@ -36,7 +36,11 @@
 #include "src/QR/ColPivHouseholderQR.h"
 #include "src/QR/CompleteOrthogonalDecomposition.h"
 #ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/QR/HouseholderQR_LAPACKE.h"
 #include "src/QR/ColPivHouseholderQR_LAPACKE.h"
 #endif
Eigen/QtAlignedMalloc
@@ -14,7 +14,7 @@
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
-void *qMalloc(size_t size)
+void *qMalloc(std::size_t size)
 {
   return Eigen::internal::aligned_malloc(size);
 }
@@ -24,10 +24,10 @@ void qFree(void *ptr)
   Eigen::internal::aligned_free(ptr);
 }
 
-void *qRealloc(void *ptr, size_t size)
+void *qRealloc(void *ptr, std::size_t size)
 {
   void* newPtr = Eigen::internal::aligned_malloc(size);
-  memcpy(newPtr, ptr, size);
+  std::memcpy(newPtr, ptr, size);
   Eigen::internal::aligned_free(ptr);
   return newPtr;
 }
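Note: the qMalloc/qRealloc fixes qualify size_t and memcpy because <cstddef>/<cstring> are only required to declare those names in namespace std; the unqualified spellings happen to work on common toolchains but are not portable. Minimal illustration of the portable form:

    #include <cstddef>
    #include <cstring>
    #include <cstdlib>

    void *grow_copy(void *src, std::size_t n) {  // std::size_t is always declared
      void *dst = std::malloc(n);
      if (dst) std::memcpy(dst, src, n);         // std::memcpy is always declared
      return dst;
    }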
Eigen/SVD
@@ -37,7 +37,11 @@
 #include "src/SVD/JacobiSVD.h"
 #include "src/SVD/BDCSVD.h"
 #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/SVD/JacobiSVD_LAPACKE.h"
 #endif
 
Eigen/Sparse
@@ -25,7 +25,9 @@
 
 #include "SparseCore"
 #include "OrderingMethods"
+#ifndef EIGEN_MPL2_ONLY
 #include "SparseCholesky"
+#endif
 #include "SparseLU"
 #include "SparseQR"
 #include "IterativeLinearSolvers"
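Note: the guard lets <Eigen/Sparse> compile with -DEIGEN_MPL2_ONLY, under which the non-MPL2-licensed pieces pulled in via SparseCholesky would otherwise trigger a hard #error (the licensing rationale is my reading, not stated in the diff). Sketch:

    #define EIGEN_MPL2_ONLY   // refuse any code that is not MPL2-licensed
    #include <Eigen/Sparse>

    int main() {
      Eigen::SparseMatrix<double> m(10, 10);
      m.insert(3, 4) = 1.5;          // SparseCore remains fully available
      return static_cast<int>(m.nonZeros());
    }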
Eigen/SparseQR
@@ -28,7 +28,6 @@
   *
   */
 
-#include "OrderingMethods"
 #include "src/SparseCore/SparseColEtree.h"
 #include "src/SparseQR/SparseQR.h"
 
Eigen/StdDeque
@@ -14,7 +14,7 @@
 #include "Core"
 #include <deque>
 
-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 
 #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...)
 
Eigen/StdList
@@ -13,7 +13,7 @@
 #include "Core"
 #include <list>
 
-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 
 #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...)
 
Eigen/StdVector
@@ -14,7 +14,7 @@
 #include "Core"
 #include <vector>
 
-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 
 #define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...)
 
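Note: StdDeque, StdList and StdVector all get the same tightened condition: MSVC's x64 ABI only auto-aligns to 16 bytes, so the "no specialization needed" shortcut is only valid while EIGEN_MAX_STATIC_ALIGN_BYTES <= 16 — with AVX builds (32-byte requirement) the aligned-allocator specializations are needed again. The portable pattern these headers support, as a sketch:

    #include <Eigen/StdVector>
    #include <vector>

    int main() {
      // Fixed-size vectorizable payloads should use Eigen's aligned allocator;
      // with AVX enabled, Eigen::Vector4d requires 32-byte alignment.
      std::vector<Eigen::Vector4d, Eigen::aligned_allocator<Eigen::Vector4d> > v;
      v.resize(8, Eigen::Vector4d::Zero());
      return static_cast<int>(v.size());
    }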
Eigen/src/Cholesky/LDLT.h
@@ -248,7 +248,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
     /** \brief Reports whether previous computation was successful.
       *
       * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
+      *          \c NumericalIssue if the factorization failed because of a zero pivot.
       */
     ComputationInfo info() const
     {
@@ -305,7 +305,8 @@ template<> struct ldlt_inplace<Lower>
     if (size <= 1)
     {
       transpositions.setIdentity();
-      if (numext::real(mat.coeff(0,0)) > static_cast<RealScalar>(0) ) sign = PositiveSemiDef;
+      if(size==0) sign = ZeroSign;
+      else if (numext::real(mat.coeff(0,0)) > static_cast<RealScalar>(0) ) sign = PositiveSemiDef;
       else if (numext::real(mat.coeff(0,0)) < static_cast<RealScalar>(0)) sign = NegativeSemiDef;
       else sign = ZeroSign;
       return true;
@@ -376,6 +377,8 @@ template<> struct ldlt_inplace<Lower>
 
     if((rs>0) && pivot_is_valid)
       A21 /= realAkk;
+    else if(rs>0)
+      ret = ret && (A21.array()==Scalar(0)).all();
 
     if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed
     else if(!pivot_is_valid) found_zero_pivot = true;
@@ -568,13 +571,14 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
   // more precisely, use pseudo-inverse of D (see bug 241)
   using std::abs;
   const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
-  // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
-  // as motivated by LAPACK's xGELSS:
+  // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min())
+  // and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS:
   // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
   // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
   // diagonal element is not well justified and leads to numerical issues in some cases.
   // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
-  RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
+  // Using numeric_limits::min() gives us more robustness to denormals.
+  RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
 
   for (Index i = 0; i < vecD.size(); ++i)
   {
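Note: three behavioural fixes here: a 0x0 matrix now gets sign = ZeroSign instead of reading coeff(0,0); a zero pivot sitting above a non-zero column now marks the factorization as failed instead of silently producing garbage; and the solve tolerance switches to numeric_limits::min() for robustness to denormals. A sketch of the semidefinite case the second fix targets — the expected results are my reading of the new code, not taken from the diff:

    #include <Eigen/Cholesky>
    #include <iostream>

    int main() {
      Eigen::Matrix2d good, bad;
      good << 1, 0,
              0, 0;   // PSD: zero pivot, but the column below it is consistent
      bad  << 0, 1,
              1, 0;   // indefinite: zero pivot with a non-zero off-diagonal
      std::cout << (Eigen::LDLT<Eigen::Matrix2d>(good).info() == Eigen::Success) << "\n";  // expect 1
      std::cout << (Eigen::LDLT<Eigen::Matrix2d>(bad).info()  == Eigen::Success) << "\n";  // expect 0
    }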
Eigen/src/Cholesky/LLT.h
@@ -41,14 +41,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
   * Example: \include LLT_example.cpp
   * Output: \verbinclude LLT_example.out
   *
+  * \b Performance: for best performance, it is recommended to use a column-major storage format
+  * with the Lower triangular part (the default), or, equivalently, a row-major storage format
+  * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
+  * step, and rank-updates can be up to 3 times slower.
+  *
   * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
   *
+  * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
+  * Therefore, the strict lower part does not have to store correct values.
+  *
   * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
   */
-/* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
- * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
- * the strict lower part does not have to store correct values.
- */
 template<typename _MatrixType, int _UpLo> class LLT
 {
   public:
@@ -146,7 +150,7 @@ template<typename _MatrixType, int _UpLo> class LLT
     }
 
     template<typename Derived>
-    void solveInPlace(MatrixBase<Derived> &bAndX) const;
+    void solveInPlace(const MatrixBase<Derived> &bAndX) const;
 
     template<typename InputType>
     LLT& compute(const EigenBase<InputType>& matrix);
@@ -177,7 +181,7 @@ template<typename _MatrixType, int _UpLo> class LLT
     /** \brief Reports whether previous computation was successful.
       *
      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
+      *          \c NumericalIssue if the matrix.appears not to be positive definite.
       */
     ComputationInfo info() const
     {
@@ -425,6 +429,7 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
   eigen_assert(a.rows()==a.cols());
   const Index size = a.rows();
   m_matrix.resize(size, size);
+  if (!internal::is_same_dense(m_matrix, a.derived()))
   m_matrix = a.derived();
 
   // Compute matrix L1 norm = max abs column sum.
@@ -485,11 +490,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
   *
   * This version avoids a copy when the right hand side matrix b is not needed anymore.
   *
+  * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+  * This function will const_cast it, so constness isn't honored here.
+  *
   * \sa LLT::solve(), MatrixBase::llt()
   */
 template<typename MatrixType, int _UpLo>
 template<typename Derived>
-void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
+void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
 {
   eigen_assert(m_isInitialized && "LLT is not initialized.");
   eigen_assert(m_matrix.rows()==bAndX.rows());
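Note: taking the right-hand side as const MatrixBase& (with an internal const_cast, per the new \warning) is what lets callers pass writable temporaries such as block expressions. Sketch:

    #include <Eigen/Dense>
    #include <Eigen/Cholesky>
    #include <iostream>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(3, 3);
      Eigen::MatrixXd S = A * A.transpose() + 3.0 * Eigen::MatrixXd::Identity(3, 3);
      Eigen::LLT<Eigen::MatrixXd> llt(S);

      Eigen::MatrixXd rhs = Eigen::MatrixXd::Random(3, 2);
      Eigen::VectorXd b0 = rhs.col(0);
      llt.solveInPlace(rhs.col(0));  // rhs.col(0) is a temporary Block: now accepted
      std::cout << (S * rhs.col(0) - b0).norm() << "\n";  // ~0
    }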
Eigen/src/Core/Array.h
@@ -153,8 +153,6 @@ class Array
       : Base(std::move(other))
     {
       Base::_check_template_params();
-      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
-        Base::_set_noalias(other);
     }
     EIGEN_DEVICE_FUNC
     Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
@@ -231,10 +229,16 @@ class Array
       : Base(other)
     { }
 
+  private:
+    struct PrivateType {};
+  public:
+
     /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
+    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other,
+                              typename internal::enable_if<internal::is_convertible<typename OtherDerived::Scalar,Scalar>::value,
+                                                           PrivateType>::type = PrivateType())
     : Base(other.derived())
     { }
 
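Note: the PrivateType/enable_if trick removes the EigenBase constructor from overload resolution unless OtherDerived's scalar converts to the target scalar, so impossible mixed-scalar constructions fail cleanly at the call site. Sketch:

    #include <Eigen/Dense>

    int main() {
      Eigen::Array22cd c = Eigen::Array22cd::Zero();  // complex<double> scalars
      // Eigen::Array22d direct(c);  // complex -> double is not convertible:
      //                             // the constructor now SFINAEs away entirely
      Eigen::Array22d ok = c.real(); // the supported, explicit route
      return static_cast<int>(ok(0, 0));
    }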
Eigen/src/Core/ArrayBase.h
@@ -153,8 +153,8 @@ template<typename Derived> class ArrayBase
     // inline void evalTo(Dest& dst) const { dst = matrix(); }
 
   protected:
-    EIGEN_DEVICE_FUNC
-    ArrayBase() : Base() {}
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(ArrayBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(ArrayBase)
 
   private:
     explicit ArrayBase(Index);
@@ -175,7 +175,7 @@ template<typename Derived> class ArrayBase
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
 {
   call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -188,7 +188,7 @@ ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
 {
   call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -201,7 +201,7 @@ ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
 {
   call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -214,7 +214,7 @@ ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
 {
   call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());
Eigen/src/Core/ArrayWrapper.h
@@ -32,7 +32,8 @@ struct traits<ArrayWrapper<ExpressionType> >
   // Let's remove NestByRefBit
   enum {
     Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
-    Flags = Flags0 & ~NestByRefBit
+    LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+    Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
   };
 };
 }
@@ -129,7 +130,8 @@ struct traits<MatrixWrapper<ExpressionType> >
   // Let's remove NestByRefBit
   enum {
     Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
-    Flags = Flags0 & ~NestByRefBit
+    LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+    Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
   };
 };
 }
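Note: both wrapper traits now compute LvalueBit from the wrapped expression instead of inheriting whatever Flags0 carried, so .array()/.matrix() views are writable exactly when the underlying expression is. Sketch:

    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXd m = Eigen::MatrixXd::Ones(2, 2);
      m.array() += 1.0;           // m is an lvalue: the wrapper stays writable
      // (m + m).array() += 1.0;  // wrapping a read-only expression: rejected
      return static_cast<int>(m(0, 0));
    }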
Eigen/src/Core/AssignEvaluator.h
@@ -39,7 +39,7 @@ public:
   enum {
     DstAlignment = DstEvaluator::Alignment,
     SrcAlignment = SrcEvaluator::Alignment,
-    DstHasDirectAccess = DstFlags & DirectAccessBit,
+    DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit,
     JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
   };
 
@@ -83,7 +83,7 @@ private:
                    && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
                    && (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)),
     MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
-    MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
+    MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess)
                        && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
   /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
      so it's only good for large enough sizes. */
@@ -515,7 +515,7 @@ struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
 {
-  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::Scalar Scalar;
     typedef typename Kernel::PacketType PacketType;
@@ -563,7 +563,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
 {
-  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
     typedef typename Kernel::PacketType PacketType;
@@ -701,6 +701,26 @@ protected:
  * Part 5 : Entry point for dense rectangular assignment
 ***************************************************************************/
 
+template<typename DstXprType,typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &/*func*/)
+{
+  EIGEN_ONLY_USED_FOR_DEBUG(dst);
+  EIGEN_ONLY_USED_FOR_DEBUG(src);
+  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+}
+
+template<typename DstXprType,typename SrcXprType, typename T1, typename T2>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::assign_op<T1,T2> &/*func*/)
+{
+  Index dstRows = src.rows();
+  Index dstCols = src.cols();
+  if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols)))
+    dst.resize(dstRows, dstCols);
+  eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols);
+}
+
 template<typename DstXprType, typename SrcXprType, typename Functor>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
 {
@@ -711,10 +731,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
 
   // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
   // we need to resize the destination after the source evaluator has been created.
-  Index dstRows = src.rows();
-  Index dstCols = src.cols();
-  if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
-    dst.resize(dstRows, dstCols);
+  resize_if_allowed(dst, src, func);
 
   DstEvaluatorType dstEvaluator(dst);
 
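Note: the refactoring funnels destination resizing through resize_if_allowed, which only resizes for plain assignment (assign_op) and merely asserts matching sizes for compound functors (+=, -=, ...). The motivating aliasing case from the NOTE comment:

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(2, 5);
      A = (A * A.transpose()) / 2.0;  // source evaluated before A is resized to 2x2
      std::cout << A.rows() << "x" << A.cols() << "\n";  // prints 2x2
    }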
Eigen/src/Core/Assign_MKL.h
@@ -84,7 +84,8 @@ class vml_assign_traits
   struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>, \
                     Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
     typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType; \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
+      resize_if_allowed(dst, src, func); \
       eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
       if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) { \
         VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \
@@ -144,7 +145,8 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _)
                     Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
     typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested, \
                           const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType; \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
+      resize_if_allowed(dst, src, func); \
       eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
       VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other); \
       if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) \
Eigen/src/Core/ConditionEstimator.h
@@ -160,7 +160,7 @@ rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Deco
 {
   typedef typename Decomposition::RealScalar RealScalar;
   eigen_assert(dec.rows() == dec.cols());
-  if (dec.rows() == 0)              return RealScalar(1);
+  if (dec.rows() == 0)              return NumTraits<RealScalar>::infinity();
   if (matrix_norm == RealScalar(0)) return RealScalar(0);
   if (dec.rows() == 1)              return RealScalar(1);
   const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec);
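Note: rcond is estimated as 1 / (||A||_1 * ||A^-1||_1); for an empty decomposition there is nothing to be ill-conditioned about, and infinity is the natural limit (it also keeps "rcond >= threshold" checks passing vacuously). Sketch:

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXd empty(0, 0);
      Eigen::PartialPivLU<Eigen::MatrixXd> lu(empty);
      std::cout << lu.rcond() << "\n";  // now inf instead of 1
    }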
Eigen/src/Core/CoreEvaluators.h
@@ -977,7 +977,7 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
     OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
                              ? int(outer_stride_at_compile_time<ArgType>::ret)
                              : int(inner_stride_at_compile_time<ArgType>::ret),
-    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0,
+    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,
 
     FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
     FlagsRowMajorBit = XprType::Flags&RowMajorBit,
@@ -987,7 +987,9 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
     Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
 
     PacketAlignment = unpacket_traits<PacketScalar>::alignment,
-    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
+    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic)
+                             && (OuterStrideAtCompileTime!=0)
+                             && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
     Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
   };
   typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
@@ -1018,14 +1020,16 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
     : m_argImpl(block.nestedExpression()),
       m_startRow(block.startRow()),
-      m_startCol(block.startCol())
+      m_startCol(block.startCol()),
+      m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0)
   { }
 
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
 
   enum {
-    RowsAtCompileTime = XprType::RowsAtCompileTime
+    RowsAtCompileTime = XprType::RowsAtCompileTime,
+    ForwardLinearAccess = InnerPanel && bool(evaluator<ArgType>::Flags&LinearAccessBit)
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1037,6 +1041,9 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
+    if (ForwardLinearAccess)
+      return m_argImpl.coeff(m_linear_offset.value() + index);
+    else
     return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
   }
 
@@ -1049,6 +1056,9 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
   {
+    if (ForwardLinearAccess)
+      return m_argImpl.coeffRef(m_linear_offset.value() + index);
+    else
     return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
   }
 
@@ -1063,6 +1073,9 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
   {
+    if (ForwardLinearAccess)
+      return m_argImpl.template packet<LoadMode,PacketType>(m_linear_offset.value() + index);
+    else
     return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
                                        RowsAtCompileTime == 1 ? index : 0);
   }
@@ -1078,6 +1091,9 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketType& x)
   {
+    if (ForwardLinearAccess)
+      return m_argImpl.template writePacket<StoreMode,PacketType>(m_linear_offset.value() + index, x);
+    else
     return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
                                              RowsAtCompileTime == 1 ? index : 0,
                                              x);
@@ -1087,6 +1103,7 @@ protected:
   evaluator<ArgType> m_argImpl;
   const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
   const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
+  const variable_if_dynamic<Index, InnerPanel ? Dynamic : 0> m_linear_offset;
 };
 
 // TODO: This evaluator does not actually use the child evaluator;
@@ -1556,9 +1573,7 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
   { }
 
   typedef typename XprType::Scalar Scalar;
-  // FIXME having to check whether ArgType is sparse here i not very nice.
-  typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value,
-                                         typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index) const
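Note: for an inner panel (whole columns of a column-major matrix, or whole rows of a row-major one) the block is contiguous in the parent's linear order at a fixed offset — startCol()*rows() resp. startRow()*cols() — which is exactly what m_linear_offset caches and ForwardLinearAccess exploits for coeff/packet access. Equivalent index arithmetic:

    #include <Eigen/Dense>
    #include <cassert>

    int main() {
      Eigen::MatrixXd m = Eigen::MatrixXd::Random(4, 6);
      const Eigen::Index start_col = 2, cols = 3;
      const Eigen::Index offset = start_col * m.rows();  // linear offset of the panel
      for (Eigen::Index i = 0; i < m.rows() * cols; ++i)
        assert(m.middleCols(start_col, cols)(i % m.rows(), i / m.rows())
               == *(m.data() + offset + i));             // column-major storage
      return 0;
    }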
Eigen/src/Core/CwiseNullaryOp.h
@@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
 {
   return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
@@ -150,7 +150,7 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
 {
   return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
@@ -192,7 +192,7 @@ DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(Index size, const Scalar& value)
 {
   return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
@@ -208,7 +208,7 @@ DenseBase<Derived>::Constant(Index size, const Scalar& value)
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(const Scalar& value)
 {
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
   * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -232,7 +232,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const
   * \sa LinSpaced(Scalar,Scalar)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -264,7 +264,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
   * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -276,7 +276,7 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
   * Special version for fixed size types which does not require the size parameter.
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -286,7 +286,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 
 /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
 template<typename Derived>
-bool DenseBase<Derived>::isApproxToConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant
 (const Scalar& val, const RealScalar& prec) const
 {
   typename internal::nested_eval<Derived,1>::type self(derived());
@@ -301,7 +301,7 @@ bool DenseBase<Derived>::isApproxToConstant
   *
   * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
 template<typename Derived>
-bool DenseBase<Derived>::isConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isConstant
 (const Scalar& val, const RealScalar& prec) const
 {
   return isApproxToConstant(val, prec);
@@ -312,7 +312,7 @@ bool DenseBase<Derived>::isConstant
   * \sa setConstant(), Constant(), class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
 {
   setConstant(val);
 }
@@ -322,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
   * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
 {
   return derived() = Constant(rows(), cols(), val);
 }
@@ -337,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
   * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
 {
   resize(size);
@@ -356,7 +356,7 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
   * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
 {
   resize(rows, cols);
@@ -380,7 +380,7 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
   * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
@@ -400,7 +400,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
   * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return setLinSpaced(size(), low, high);
@@ -423,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
   * \sa Zero(), Zero(Index)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero(Index rows, Index cols)
 {
   return Constant(rows, cols, Scalar(0));
@@ -446,7 +446,7 @@ DenseBase<Derived>::Zero(Index rows, Index cols)
   * \sa Zero(), Zero(Index,Index)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero(Index size)
 {
   return Constant(size, Scalar(0));
@@ -463,7 +463,7 @@ DenseBase<Derived>::Zero(Index size)
   * \sa Zero(Index), Zero(Index,Index)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero()
 {
   return Constant(Scalar(0));
@@ -478,7 +478,7 @@ DenseBase<Derived>::Zero()
   * \sa class CwiseNullaryOp, Zero()
   */
 template<typename Derived>
-bool DenseBase<Derived>::isZero(const RealScalar& prec) const
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const
 {
   typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index j = 0; j < cols(); ++j)
@@ -496,7 +496,7 @@ bool DenseBase<Derived>::isZero(const RealScalar& prec) const
   * \sa class CwiseNullaryOp, Zero()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
 {
   return setConstant(Scalar(0));
 }
@@ -511,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
   * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(Index newSize)
 {
   resize(newSize);
@@ -529,7 +529,7 @@ PlainObjectBase<Derived>::setZero(Index newSize)
   * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(Index rows, Index cols)
 {
   resize(rows, cols);
@@ -553,7 +553,7 @@ PlainObjectBase<Derived>::setZero(Index rows, Index cols)
   * \sa Ones(), Ones(Index), isOnes(), class Ones
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones(Index rows, Index cols)
 {
   return Constant(rows, cols, Scalar(1));
@@ -576,7 +576,7 @@ DenseBase<Derived>::Ones(Index rows, Index cols)
   * \sa Ones(), Ones(Index,Index), isOnes(), class Ones
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones(Index newSize)
 {
   return Constant(newSize, Scalar(1));
@@ -593,7 +593,7 @@ DenseBase<Derived>::Ones(Index newSize)
   * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones()
 {
   return Constant(Scalar(1));
@@ -608,7 +608,7 @@ DenseBase<Derived>::Ones()
   * \sa class CwiseNullaryOp, Ones()
   */
 template<typename Derived>
-bool DenseBase<Derived>::isOnes
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isOnes
 (const RealScalar& prec) const
 {
   return isApproxToConstant(Scalar(1), prec);
@@ -622,7 +622,7 @@ bool DenseBase<Derived>::isOnes
   * \sa class CwiseNullaryOp, Ones()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
 {
   return setConstant(Scalar(1));
 }
@@ -637,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
   * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(Index newSize)
 {
   resize(newSize);
@@ -655,7 +655,7 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
   * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
 {
   resize(rows, cols);
@@ -679,7 +679,7 @@ PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
   * \sa Identity(), setIdentity(), isIdentity()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
 MatrixBase<Derived>::Identity(Index rows, Index cols)
 {
   return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
@@ -696,7 +696,7 @@ MatrixBase<Derived>::Identity(Index rows, Index cols)
   * \sa Identity(Index,Index), setIdentity(), isIdentity()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
 MatrixBase<Derived>::Identity()
 {
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@@ -771,7 +771,7 @@ struct setIdentity_impl<Derived, true>
   * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
 {
   return internal::setIdentity_impl<Derived>::run(derived());
 }
@@ -787,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
   * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
 {
   derived().resize(rows, cols);
   return setIdentity();
@@ -800,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index
   * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
@@ -815,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return BasisReturnType(SquareMatrixType::Identity(),i);
@@ -828,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
 { return Derived::Unit(0); }
 
 /** \returns an expression of the Y axis unit vector (0,1{,0}^*)
@@ -838,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
 { return Derived::Unit(1); }
 
 /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
@@ -848,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
 { return Derived::Unit(2); }
 
 /** \returns an expression of the W axis unit vector (0,0,0,1)
@@ -858,7 +858,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
 { return Derived::Unit(3); }
 
 } // end namespace Eigen
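Note: the bulk of this file's churn is mechanical — prepending EIGEN_DEVICE_FUNC so Zero()/Ones()/Constant()/setIdentity() and friends become callable inside CUDA kernels. Under nvcc the macro expands to __host__ __device__ (see the Core changes above), otherwise to nothing. The mechanism in isolation:

    #if defined(__CUDACC__)
      #define MY_DEVICE_FUNC __host__ __device__  // same idea as EIGEN_DEVICE_FUNC
    #else
      #define MY_DEVICE_FUNC
    #endif

    template <typename T>
    MY_DEVICE_FUNC T clamp01(T x) {  // usable from both host code and CUDA kernels
      return x < T(0) ? T(0) : (x > T(1) ? T(1) : x);
    }

    int main() { return static_cast<int>(clamp01(2.0)); }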
Eigen/src/Core/CwiseUnaryView.h
@@ -121,6 +121,8 @@ class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
     {
       return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
     }
+  protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl)
 };
 
 } // end namespace Eigen
Eigen/src/Core/DenseBase.h
@@ -40,7 +40,7 @@ static inline void check_DenseIndex_is_signed() {
   */
 template<typename Derived> class DenseBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-  : public DenseCoeffsBase<Derived>
+  : public DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value>
 #else
   : public DenseCoeffsBase<Derived,DirectWriteAccessors>
 #endif // not EIGEN_PARSED_BY_DOXYGEN
@@ -71,7 +71,7 @@ template<typename Derived> class DenseBase
     typedef Scalar value_type;
 
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef DenseCoeffsBase<Derived> Base;
+    typedef DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value> Base;
 
     using Base::derived;
     using Base::const_cast_derived;
@@ -296,7 +296,7 @@ template<typename Derived> class DenseBase
     EIGEN_DEVICE_FUNC
     Derived& operator=(const ReturnByValue<OtherDerived>& func);
 
-    /** \ínternal
+    /** \internal
       * Copies \a other into *this without evaluating other. \returns a reference to *this.
      * \deprecated */
     template<typename OtherDerived>
@@ -463,7 +463,17 @@ template<typename Derived> class DenseBase
     EIGEN_DEVICE_FUNC
     void visit(Visitor& func) const;
 
-    inline const WithFormat<Derived> format(const IOFormat& fmt) const;
+    /** \returns a WithFormat proxy object allowing to print a matrix the with given
+      * format \a fmt.
+      *
+      * See class IOFormat for some examples.
+      *
+      * \sa class IOFormat, class WithFormat
+      */
+    inline const WithFormat<Derived> format(const IOFormat& fmt) const
+    {
+      return WithFormat<Derived>(derived(), fmt);
+    }
 
     /** \returns the unique coefficient of a 1x1 expression */
     EIGEN_DEVICE_FUNC
@@ -474,9 +484,9 @@ template<typename Derived> class DenseBase
       return derived().coeff(0,0);
     }
 
-    bool all() const;
-    bool any() const;
-    Index count() const;
+    EIGEN_DEVICE_FUNC bool all() const;
+    EIGEN_DEVICE_FUNC bool any() const;
+    EIGEN_DEVICE_FUNC Index count() const;
 
     typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
     typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;
@@ -577,11 +587,12 @@ template<typename Derived> class DenseBase
     }
 
   protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase)
    /** Default constructor. Do nothing. */
    EIGEN_DEVICE_FUNC DenseBase()
     {
       /* Just checks for self-consistency of the flags.
-       * Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down
+       * Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down
        */
 #ifdef EIGEN_INTERNAL_DEBUGGING
       EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor))
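Note: format() gains an in-class definition because IO.h (which defines WithFormat) is now included before DenseBase.h in Core — see the include reshuffle above. Usage is unchanged:

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::Matrix2d m;
      m << 1, 2,
           3, 4;
      Eigen::IOFormat fmt(3, 0, ", ", "\n", "[", "]");
      std::cout << m.format(fmt) << "\n";
    }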
@ -13,9 +13,9 @@
|
|||
#define EIGEN_MATRIXSTORAGE_H
|
||||
|
||||
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
|
||||
#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
|
||||
#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) X; EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
|
||||
#else
|
||||
#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
|
||||
#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X)
|
||||
#endif
|
||||
|
||||
namespace Eigen {

@ -184,12 +184,16 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
{
internal::plain_array<T,Size,_Options> m_data;
public:
EIGEN_DEVICE_FUNC DenseStorage() {}
EIGEN_DEVICE_FUNC DenseStorage() {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
}
EIGEN_DEVICE_FUNC
explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
: m_data(internal::constructor_without_unaligned_array_assert()) {}
EIGEN_DEVICE_FUNC
DenseStorage(const DenseStorage& other) : m_data(other.m_data) {}
DenseStorage(const DenseStorage& other) : m_data(other.m_data) {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
}
EIGEN_DEVICE_FUNC
DenseStorage& operator=(const DenseStorage& other)
{

@ -197,7 +201,7 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
return *this;
}
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols);
EIGEN_UNUSED_VARIABLE(size);
EIGEN_UNUSED_VARIABLE(rows);

@ -343,7 +347,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows), m_cols(cols)
{
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0);
}
EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)

@ -351,6 +355,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
, m_rows(other.m_rows)
, m_cols(other.m_cols)
{
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*m_cols)
internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data);
}
EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)

@ -399,11 +404,11 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
if(size != m_rows*m_cols)
{
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols);
if (size)
if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else
m_data = 0;
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
}
m_rows = rows;
m_cols = cols;

@ -422,7 +427,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(cols)
{
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0);
EIGEN_UNUSED_VARIABLE(rows);
}

@ -430,6 +435,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
, m_cols(other.m_cols)
{
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows)
internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data);
}
EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)

@ -473,11 +479,11 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
if(size != _Rows*m_cols)
{
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols);
if (size)
if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else
m_data = 0;
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
}
m_cols = cols;
}

@ -495,7 +501,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows)
{
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols);
EIGEN_UNUSED_VARIABLE(cols);
}

@ -503,6 +509,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
, m_rows(other.m_rows)
{
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols)
internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data);
}
EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)

@ -546,11 +553,11 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
if(size != m_rows*_Cols)
{
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows);
if (size)
if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else
m_data = 0;
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
}
m_rows = rows;
}

@ -21,7 +21,7 @@ namespace Eigen {
* \param MatrixType the type of the object in which we are taking a sub/main/super diagonal
* \param DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal.
* A positive value means a superdiagonal, a negative value means a subdiagonal.
* You can also use Dynamic so the index can be set at runtime.
* You can also use DynamicIndex so the index can be set at runtime.
*
* The matrix is not required to be square.
*

@ -70,7 +70,10 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)

EIGEN_DEVICE_FUNC
explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index)
{
eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() );
}

EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
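
With the added assertion, a diagonal index that lies entirely outside the matrix now fails fast in debug builds. A quick sketch of the accepted range:

#include <Eigen/Core>

int main()
{
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(3, 5);
  m.diagonal();      // main diagonal (index 0)
  m.diagonal(2);     // superdiagonal: OK since 2 <= m.cols()
  m.diagonal(-2);    // subdiagonal:   OK since 2 <= m.rows()
  // m.diagonal(6);  // would now trigger the eigen_assert (6 > m.cols())
  return 0;
}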
@ -31,7 +31,8 @@ struct dot_nocheck
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
typedef typename conj_prod::result_type ResScalar;
EIGEN_DEVICE_FUNC
static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
EIGEN_STRONG_INLINE
static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{
return a.template binaryExpr<conj_prod>(b).sum();
}

@ -43,7 +44,8 @@ struct dot_nocheck<T, U, true>
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
typedef typename conj_prod::result_type ResScalar;
EIGEN_DEVICE_FUNC
static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
EIGEN_STRONG_INLINE
static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{
return a.transpose().template binaryExpr<conj_prod>(b).sum();
}

@ -65,6 +67,7 @@ struct dot_nocheck<T, U, true>
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
{

@ -102,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
* \sa lpNorm(), dot(), squaredNorm()
*/
template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
{
return numext::sqrt(squaredNorm());
}

@ -117,7 +120,7 @@ inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real Matr
* \sa norm(), normalize()
*/
template<typename Derived>
inline const typename MatrixBase<Derived>::PlainObject
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::normalized() const
{
typedef typename internal::nested_eval<Derived,2>::type _Nested;

@ -139,7 +142,7 @@ MatrixBase<Derived>::normalized() const
* \sa norm(), normalized()
*/
template<typename Derived>
inline void MatrixBase<Derived>::normalize()
EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
{
RealScalar z = squaredNorm();
// NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU

@ -160,7 +163,7 @@ inline void MatrixBase<Derived>::normalize()
* \sa stableNorm(), stableNormalize(), normalized()
*/
template<typename Derived>
inline const typename MatrixBase<Derived>::PlainObject
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::stableNormalized() const
{
typedef typename internal::nested_eval<Derived,3>::type _Nested;

@ -185,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
* \sa stableNorm(), stableNormalized(), normalize()
*/
template<typename Derived>
inline void MatrixBase<Derived>::stableNormalize()
EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
{
RealScalar w = cwiseAbs().maxCoeff();
RealScalar z = (derived()/w).squaredNorm();
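
The hunks above only strengthen inlining hints; behavior is unchanged. For orientation, a short sketch contrasting normalized() with its stable variant, which first rescales by the largest absolute coefficient so that squaredNorm() cannot overflow or underflow:

#include <Eigen/Core>
#include <iostream>

int main()
{
  Eigen::Vector2d v(3.0, 4.0);
  std::cout << v.norm() << "\n";                    // 5
  std::cout << v.normalized().transpose() << "\n";  // 0.6 0.8 (v itself unchanged)
  v.normalize();                                    // same, but in place

  Eigen::Vector2d tiny(1e-200, 1e-200);
  // tiny.squaredNorm() underflows to 0, so normalized() would blow up,
  // but the stable variant rescales first:
  std::cout << tiny.stableNormalized().transpose() << "\n";  // ~0.707 0.707
  return 0;
}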
@ -14,6 +14,7 @@
namespace Eigen {

/** \class EigenBase
* \ingroup Core_Module
*
* Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
*

@ -128,6 +129,7 @@ template<typename Derived> struct EigenBase
*/
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived());

@ -136,6 +138,7 @@ Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)

template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());

@ -144,6 +147,7 @@ Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)

template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());

@ -24,9 +24,14 @@ template<int Rows, int Cols, int Depth> struct product_type_selector;

template<int Size, int MaxSize> struct product_size_category
{
enum { is_large = MaxSize == Dynamic ||
enum {
#ifndef EIGEN_CUDA_ARCH
is_large = MaxSize == Dynamic ||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
#else
is_large = 0,
#endif
value = is_large ? Large
: Size == 1 ? 1
: Small

@ -379,8 +384,6 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
*
* \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
*/
#ifndef __CUDACC__

template<typename Derived>
template<typename OtherDerived>
inline const Product<Derived, OtherDerived>

@ -412,8 +415,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
return Product<Derived, OtherDerived>(derived(), other.derived());
}

#endif // __CUDACC__

/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
*
* The returned product will behave like any other expressions: the coefficients of the product will be

@ -230,7 +230,7 @@ pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(
* duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
* Currently, this function is only used for scalar * complex products.
*/
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }

/** \internal \returns a packet with elements of \a *from quadrupled.

@ -278,7 +278,7 @@ inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
}

/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
template<typename Packet> inline Packet
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
plset(const typename unpacket_traits<Packet>::type& a) { return a; }

/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */

@ -351,10 +351,7 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet&
/** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
{
// FIXME: uncomment the following in case we drop the internal imag and real functions.
// using std::imag;
// using std::real;
return Packet(imag(a),real(a));
return Packet(a.imag(),a.real());
}

/**************************

@ -482,7 +479,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
* by the current computation.
*/
template<typename Packet, int LoadMode>
inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
{
return ploadt<Packet, LoadMode>(from);
}

@ -524,10 +521,10 @@ inline void palign(PacketType& first, const PacketType& second)
#ifndef __CUDACC__

template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
{ return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }

template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
{ return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
{ return std::complex<double>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }

#endif

@ -109,20 +109,6 @@ class WithFormat
IOFormat m_format;
};

/** \returns a WithFormat proxy object allowing to print a matrix the with given
* format \a fmt.
*
* See class IOFormat for some examples.
*
* \sa class IOFormat, class WithFormat
*/
template<typename Derived>
inline const WithFormat<Derived>
DenseBase<Derived>::format(const IOFormat& fmt) const
{
return WithFormat<Derived>(derived(), fmt);
}

namespace internal {

// NOTE: This helper is kept for backward compatibility with previous code specializing

@ -20,11 +20,17 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
{
typedef traits<PlainObjectType> TraitsBase;
enum {
PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
? PlainObjectType::ColsAtCompileTime
: PlainObjectType::RowsAtCompileTime,

InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
? int(PlainObjectType::InnerStrideAtCompileTime)
: int(StrideType::InnerStrideAtCompileTime),
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
? int(PlainObjectType::OuterStrideAtCompileTime)
? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
? Dynamic
: int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
: int(StrideType::OuterStrideAtCompileTime),
Alignment = int(MapOptions)&int(AlignedMask),
Flags0 = TraitsBase::Flags & (~NestByRefBit),

@ -107,10 +113,11 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
EIGEN_DEVICE_FUNC
inline Index outerStride() const
{
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
: IsVectorAtCompileTime ? this->size()
: int(Flags)&RowMajorBit ? this->cols()
: this->rows();
return int(StrideType::OuterStrideAtCompileTime) != 0 ? m_stride.outer()
: int(internal::traits<Map>::OuterStrideAtCompileTime) != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
: IsVectorAtCompileTime ? (this->size() * innerStride())
: (int(Flags)&RowMajorBit) ? (this->cols() * innerStride())
: (this->rows() * innerStride());
}

/** Constructor in the fixed-size case.
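
The outerStride() rewrite matters for a Map with a non-unit inner stride but a default outer stride: the default is now computed in scalars including the inner stride. A sketch (column-major, inner stride 2):

#include <Eigen/Core>
#include <iostream>

int main()
{
  double data[12];
  for (int i = 0; i < 12; ++i) data[i] = i;
  // View every other entry as a 2x3 column-major matrix: rows are 2 doubles
  // apart, and columns are rows*innerStride = 4 doubles apart.
  Eigen::Map<Eigen::MatrixXd, 0, Eigen::InnerStride<2> > m(data, 2, 3);
  std::cout << m << "\n";               // 0 4 8
                                        // 2 6 10
  std::cout << m.outerStride() << "\n"; // 4, i.e. rows() * innerStride()
  return 0;
}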
@ -43,6 +43,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
enum {
RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
InnerStrideAtCompileTime = internal::traits<Derived>::InnerStrideAtCompileTime,
SizeAtCompileTime = Base::SizeAtCompileTime
};

@ -181,14 +182,19 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
#endif

protected:
EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)

template<typename T>
EIGEN_DEVICE_FUNC
void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
{
#if EIGEN_MAX_ALIGN_BYTES>0
// innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value:
const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime);
EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride);
eigen_assert(( ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
|| (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
|| (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
#endif
}

@ -290,6 +296,9 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
// In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base,
// see bugs 821 and 920.
using ReadOnlyMapBase::Base::operator=;
protected:
EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
};

#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS

@ -287,7 +287,7 @@ struct abs2_impl_default<Scalar, true> // IsComplex
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x)
{
return real(x)*real(x) + imag(x)*imag(x);
return x.real()*x.real() + x.imag()*x.imag();
}
};

@ -313,14 +313,17 @@ struct abs2_retval
****************************************************************************/

template<typename Scalar, bool IsComplex>
struct norm1_default_impl
struct norm1_default_impl;

template<typename Scalar>
struct norm1_default_impl<Scalar,true>
{
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x)
{
EIGEN_USING_STD_MATH(abs);
return abs(real(x)) + abs(imag(x));
return abs(x.real()) + abs(x.imag());
}
};

@ -348,31 +351,7 @@ struct norm1_retval
* Implementation of hypot *
****************************************************************************/

template<typename Scalar>
struct hypot_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
static inline RealScalar run(const Scalar& x, const Scalar& y)
{
EIGEN_USING_STD_MATH(abs);
EIGEN_USING_STD_MATH(sqrt);
RealScalar _x = abs(x);
RealScalar _y = abs(y);
Scalar p, qp;
if(_x>_y)
{
p = _x;
qp = _y / p;
}
else
{
p = _y;
qp = _x / p;
}
if(p==RealScalar(0)) return RealScalar(0);
return p * sqrt(RealScalar(1) + qp*qp);
}
};
template<typename Scalar> struct hypot_impl;

template<typename Scalar>
struct hypot_retval

@ -495,7 +474,7 @@ namespace std_fallback {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_USING_STD_MATH(log);
Scalar x1p = RealScalar(1) + x;
return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
}
}

@ -641,20 +620,27 @@ struct random_default_impl<Scalar, false, true>
{
static inline Scalar run(const Scalar& x, const Scalar& y)
{
typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX;
if(y<x)
if (y <= x)
return x;
// the following difference might overflow on a 32 bits system,
// but since y>=x the result converted to an unsigned long is still correct.
std::size_t range = ScalarX(y)-ScalarX(x);
std::size_t offset = 0;
// rejection sampling
std::size_t divisor = 1;
std::size_t multiplier = 1;
if(range<RAND_MAX) divisor = (std::size_t(RAND_MAX)+1)/(range+1);
else multiplier = 1 + range/(std::size_t(RAND_MAX)+1);
// ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
typedef typename make_unsigned<Scalar>::type ScalarU;
// ScalarX is the widest of ScalarU and unsigned int.
// We'll deal only with ScalarX and unsigned int below thus avoiding signed
// types and arithmetic and signed overflows (which are undefined behavior).
typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX;
// The following difference doesn't overflow, provided our integer types are two's
// complement and have the same number of padding bits in signed and unsigned variants.
// This is the case in most modern implementations of C++.
ScalarX range = ScalarX(y) - ScalarX(x);
ScalarX offset = 0;
ScalarX divisor = 1;
ScalarX multiplier = 1;
const unsigned rand_max = RAND_MAX;
if (range <= rand_max) divisor = (rand_max + 1) / (range + 1);
else multiplier = 1 + range / (rand_max + 1);
// Rejection sampling.
do {
offset = (std::size_t(std::rand()) * multiplier) / divisor;
offset = (unsigned(std::rand()) * multiplier) / divisor;
} while (offset > range);
return Scalar(ScalarX(x) + offset);
}
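
The same sampling scheme in standalone form may make the new code path easier to follow; this is a sketch mirroring the logic above, not Eigen's exact implementation:

#include <cstdlib>

// Uniformly sample an int in [x, y] via rejection sampling over std::rand().
// All arithmetic is unsigned, so no signed overflow (undefined behavior) occurs.
int random_in_range(int x, int y)
{
  if (y <= x) return x;
  unsigned range = unsigned(y) - unsigned(x);  // well-defined unsigned wrap-around
  unsigned divisor = 1, multiplier = 1;
  const unsigned rand_max = RAND_MAX;
  if (range <= rand_max) divisor = (rand_max + 1u) / (range + 1u);
  else                   multiplier = 1u + range / (rand_max + 1u);
  unsigned offset;
  do {
    offset = (unsigned(std::rand()) * multiplier) / divisor;
  } while (offset > range);                    // reject draws beyond the range
  return int(unsigned(x) + offset);
}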
@ -679,8 +665,8 @@ struct random_default_impl<Scalar, true, false>
{
static inline Scalar run(const Scalar& x, const Scalar& y)
{
return Scalar(random(real(x), real(y)),
random(imag(x), imag(y)));
return Scalar(random(x.real(), y.real()),
random(x.imag(), y.imag()));
}
static inline Scalar run()
{

@ -933,6 +919,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
}

EIGEN_DEVICE_FUNC
inline bool abs2(bool x) { return x; }

template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)

@ -1030,7 +1019,8 @@ inline int log2(int x)

/** \returns the square root of \a x.
*
* It is essentially equivalent to \code using std::sqrt; return sqrt(x); \endcode,
* It is essentially equivalent to
* \code using std::sqrt; return sqrt(x); \endcode
* but slightly faster for float/double and some compilers (e.g., gcc), thanks to
* specializations when SSE is enabled.
*

@ -1061,11 +1051,24 @@ double log(const double &x) { return ::log(x); }

template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename NumTraits<T>::Real abs(const T &x) {
typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
abs(const T &x) {
EIGEN_USING_STD_MATH(abs);
return abs(x);
}

template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex),typename NumTraits<T>::Real>::type
abs(const T &x) {
return x;
}

#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)

#ifdef __CUDACC__
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float abs(const float &x) { return ::fabsf(x); }

@ -71,6 +71,29 @@ T generic_fast_tanh_float(const T& a_x)
return pdiv(p, q);
}

template<typename RealScalar>
EIGEN_STRONG_INLINE
RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
{
EIGEN_USING_STD_MATH(sqrt);
RealScalar p, qp;
p = numext::maxi(x,y);
if(p==RealScalar(0)) return RealScalar(0);
qp = numext::mini(y,x) / p;
return p * sqrt(RealScalar(1) + qp*qp);
}

template<typename Scalar>
struct hypot_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
static inline RealScalar run(const Scalar& x, const Scalar& y)
{
EIGEN_USING_STD_MATH(abs);
return positive_real_hypot<RealScalar>(abs(x), abs(y));
}
};

} // end namespace internal

} // end namespace Eigen
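
The replacement positive_real_hypot above relies on the identity sqrt(x^2 + y^2) = p * sqrt(1 + (q/p)^2), with p = max(x,y) and q = min(x,y), which avoids overflow in the squaring. A standalone sketch under the same non-negativity assumption:

#include <algorithm>
#include <cmath>
#include <cstdio>

// Assumes x >= 0 and y >= 0, like Eigen's positive_real_hypot.
double positive_real_hypot(double x, double y)
{
  double p = std::max(x, y);
  if (p == 0.0) return 0.0;
  double qp = std::min(x, y) / p;        // qp <= 1, so qp*qp cannot overflow
  return p * std::sqrt(1.0 + qp * qp);
}

int main()
{
  // Naive sqrt(x*x + y*y) overflows to inf here; the rescaled form stays finite:
  std::printf("%g\n", positive_real_hypot(3e200, 4e200));  // 5e+200
  return 0;
}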
@ -274,8 +274,6 @@ class Matrix
: Base(std::move(other))
{
Base::_check_template_params();
if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
Base::_set_noalias(other);
}
EIGEN_DEVICE_FUNC
Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)

@ -160,20 +160,11 @@ template<typename Derived> class MatrixBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator-=(const MatrixBase<OtherDerived>& other);

#ifdef __CUDACC__
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const Product<Derived,OtherDerived,LazyProduct>
operator*(const MatrixBase<OtherDerived> &other) const
{ return this->lazyProduct(other); }
#else

template<typename OtherDerived>
const Product<Derived,OtherDerived>
operator*(const MatrixBase<OtherDerived> &other) const;

#endif

template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const Product<Derived,OtherDerived,LazyProduct>

@ -294,7 +285,7 @@ template<typename Derived> class MatrixBase
* fuzzy comparison such as isApprox()
* \sa isApprox(), operator!= */
template<typename OtherDerived>
inline bool operator==(const MatrixBase<OtherDerived>& other) const
EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const
{ return cwiseEqual(other).all(); }

/** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.

@ -302,7 +293,7 @@ template<typename Derived> class MatrixBase
* fuzzy comparison such as isApprox()
* \sa isApprox(), operator== */
template<typename OtherDerived>
inline bool operator!=(const MatrixBase<OtherDerived>& other) const
EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
{ return cwiseNotEqual(other).any(); }

NoAlias<Derived,Eigen::MatrixBase > noalias();

@ -453,19 +444,28 @@ template<typename Derived> class MatrixBase
///////// MatrixFunctions module /////////

typedef typename internal::stem_function<Scalar>::type StemFunction;
const MatrixExponentialReturnValue<Derived> exp() const;
#define EIGEN_MATRIX_FUNCTION(ReturnType, Name, Description) \
/** \returns an expression of the matrix Description of \c *this. \brief This function requires the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>. To compute the coefficient-wise Description use ArrayBase::##Name . */ \
const ReturnType<Derived> Name() const;
#define EIGEN_MATRIX_FUNCTION_1(ReturnType, Name, Description, Argument) \
/** \returns an expression of the matrix Description of \c *this. \brief This function requires the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>. To compute the coefficient-wise Description use ArrayBase::##Name . */ \
const ReturnType<Derived> Name(Argument) const;

EIGEN_MATRIX_FUNCTION(MatrixExponentialReturnValue, exp, exponential)
/** \brief Helper function for the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>.*/
const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
const MatrixFunctionReturnValue<Derived> cosh() const;
const MatrixFunctionReturnValue<Derived> sinh() const;
const MatrixFunctionReturnValue<Derived> cos() const;
const MatrixFunctionReturnValue<Derived> sin() const;
const MatrixSquareRootReturnValue<Derived> sqrt() const;
const MatrixLogarithmReturnValue<Derived> log() const;
const MatrixPowerReturnValue<Derived> pow(const RealScalar& p) const;
const MatrixComplexPowerReturnValue<Derived> pow(const std::complex<RealScalar>& p) const;
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine)
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine)
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine)
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine)
EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root)
EIGEN_MATRIX_FUNCTION(MatrixLogarithmReturnValue, log, logarithm)
EIGEN_MATRIX_FUNCTION_1(MatrixPowerReturnValue, pow, power to \c p, const RealScalar& p)
EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex<RealScalar>& p)

protected:
EIGEN_DEVICE_FUNC MatrixBase() : Base() {}
EIGEN_DEFAULT_COPY_CONSTRUCTOR(MatrixBase)
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MatrixBase)

private:
EIGEN_DEVICE_FUNC explicit MatrixBase(int);
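
The two helper macros only factor out the repeated declarations plus their Doxygen text; the declared API is unchanged. For instance, ignoring the documentation comment:

// EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root) declares:
const MatrixSquareRootReturnValue<Derived> sqrt() const;

// EIGEN_MATRIX_FUNCTION_1(MatrixPowerReturnValue, pow, power to \c p, const RealScalar& p) declares:
const MatrixPowerReturnValue<Derived> pow(const RealScalar& p) const;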
@ -215,6 +215,8 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
EIGEN_DEVICE_FUNC
static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }

static inline int digits10() { return NumTraits<Scalar>::digits10(); }
};

template<> struct NumTraits<std::string>

@ -87,17 +87,6 @@ class PermutationBase : public EigenBase<Derived>
return derived();
}

#ifndef EIGEN_PARSED_BY_DOXYGEN
/** This is a special case of the templated operator=. Its purpose is to
* prevent a default operator= from hiding the templated operator=.
*/
Derived& operator=(const PermutationBase& other)
{
indices() = other.indices();
return derived();
}
#endif

/** \returns the number of rows */
inline Index rows() const { return Index(indices().size()); }

@ -333,12 +322,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
inline PermutationMatrix(const PermutationBase<OtherDerived>& other)
: m_indices(other.indices()) {}

#ifndef EIGEN_PARSED_BY_DOXYGEN
/** Standard copy constructor. Defined only to prevent a default copy constructor
* from hiding the other templated constructor */
inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {}
#endif

/** Generic constructor from expression of the indices. The indices
* array has the meaning that the permutations sends each integer i to indices[i].
*

@ -373,17 +356,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
return Base::operator=(tr.derived());
}

#ifndef EIGEN_PARSED_BY_DOXYGEN
/** This is a special case of the templated operator=. Its purpose is to
* prevent a default operator= from hiding the templated operator=.
*/
PermutationMatrix& operator=(const PermutationMatrix& other)
{
m_indices = other.m_indices;
return *this;
}
#endif

/** const version of indices(). */
const IndicesType& indices() const { return m_indices; }
/** \returns a reference to the stored array representing the permutation. */

@ -41,7 +41,7 @@ template<> struct check_rows_cols_for_overflow<Dynamic> {
{
// http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
// we assume Index is signed
Index max_index = (size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
Index max_index = (std::size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
bool error = (rows == 0 || cols == 0) ? false
: (rows > max_index / cols);
if (error)

@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
* \a data pointers.
*
* Here is an example using strides:
* \include Matrix_Map_stride.cpp
* Output: \verbinclude Matrix_Map_stride.out
*
* \see class Map
*/
//@{

@ -733,8 +737,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
{
EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
bool(NumTraits<T1>::IsInteger),
const bool t0_is_integer_alike = internal::is_valid_index_type<T0>::value;
const bool t1_is_integer_alike = internal::is_valid_index_type<T1>::value;
EIGEN_STATIC_ASSERT(t0_is_integer_alike &&
t1_is_integer_alike,
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
resize(rows,cols);
}

@ -769,9 +775,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
&& ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0)
{
// NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
const bool is_integer = NumTraits<T>::IsInteger;
EIGEN_UNUSED_VARIABLE(is_integer);
EIGEN_STATIC_ASSERT(is_integer,
const bool is_integer_alike = internal::is_valid_index_type<T>::value;
EIGEN_UNUSED_VARIABLE(is_integer_alike);
EIGEN_STATIC_ASSERT(is_integer_alike,
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
resize(size);
}
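
Switching the static assertion from NumTraits<T>::IsInteger to internal::is_valid_index_type means the sizing constructors accept any integer-like index type while still rejecting floating point at compile time. A sketch:

#include <Eigen/Core>
#include <cstddef>

int main()
{
  Eigen::MatrixXd a(3, 4);            // int, int: fine
  Eigen::MatrixXd b(3L, 4L);          // long: accepted as integer-alike
  std::ptrdiff_t r = 2, c = 5;
  Eigen::MatrixXd d(r, c);            // ptrdiff_t: accepted
  // Eigen::MatrixXd e(2.0, 5.0);     // double: still a compile-time error,
  // FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED
  return 0;
}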
@ -812,6 +818,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
this->_set_noalias(other);
}

// Initialize an arbitrary matrix from an object convertible to the Derived type.
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Derived& other){
this->_set_noalias(other);
}

// Initialize an arbitrary matrix from a generic Eigen expression
template<typename T, typename OtherDerived>
EIGEN_DEVICE_FUNC

@ -834,7 +847,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
this->derived() = r;
}

// For fixed -size arrays:
// For fixed-size Array<Scalar,...>
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Scalar& val0,

@ -846,6 +859,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
Base::setConstant(val0);
}

// For fixed-size Array<Index,...>
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Index& val0,

@ -97,8 +97,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
}

EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }

EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }

@ -127,7 +127,7 @@ public:
using Base::derived;
typedef typename Base::Scalar Scalar;

operator const Scalar() const
EIGEN_STRONG_INLINE operator const Scalar() const
{
return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
}

@ -162,7 +162,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>

public:

EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
{
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );

@ -170,7 +170,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
return internal::evaluator<Derived>(derived()).coeff(row,col);
}

EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const
{
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );

@ -32,7 +32,7 @@ struct evaluator<Product<Lhs, Rhs, Options> >
typedef Product<Lhs, Rhs, Options> XprType;
typedef product_evaluator<XprType> Base;

EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
};

// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"

@ -55,7 +55,7 @@ struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
const Product<Lhs, Rhs, DefaultProduct> > XprType;
typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;

EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
{}
};

@ -68,7 +68,7 @@ struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;

EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
xpr.index() ))

@ -207,6 +207,12 @@ struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename
static const bool value = true;
};

template<typename OtherXpr, typename Lhs, typename Rhs>
struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_difference_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
static const bool value = true;
};

template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
struct assignment_from_xpr_op_product
{

@ -240,19 +246,19 @@ template<typename Lhs, typename Rhs>
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
{
template<typename Dst>
static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
}

template<typename Dst>
static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
}

template<typename Dst>
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
};

@ -306,25 +312,25 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
};

template<typename Dst>
static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
}

template<typename Dst>
static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
}

template<typename Dst>
static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
}

template<typename Dst>
static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
}

@ -405,6 +411,32 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
}

// Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
// dst {,+,-}= s * (A.lazyProduct(B))
// This is a huge benefit for heap-allocated matrix types as it save one costly allocation.
// For them, this strategy is also faster than simply by-passing the heap allocation through
// stack allocation.
// For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower,
// and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only,
// that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
{
call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
}

// Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above
// overload more specialized.
template<typename Dst, typename LhsT, typename Func>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
{
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
}


// template<typename Dst>
// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
// { dst.noalias() += alpha * lhs.lazyProduct(rhs); }
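
From the caller's side, the new eval_dynamic overload means an expression whose left factor is scalar-times-matrix can be evaluated as scalar * (A.lazyProduct(B)), without first materializing s*A. A sketch of an expression that can take this path (whether it does depends on sizes and the product mode chosen at compile time):

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(64, 64);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(64, 64);
  Eigen::MatrixXd C(64, 64);
  double s = 2.5;
  // Left-hand side matches CwiseBinaryOp<scalar_product_op, CwiseNullaryOp, Xpr2>,
  // so the scalar factor can be hoisted out of the product:
  C.noalias() = (s * A) * B;
  return 0;
}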
@ -779,7 +811,11 @@ public:
_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
_LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
Alignment = evaluator<MatrixType>::Alignment
Alignment = evaluator<MatrixType>::Alignment,

AsScalarProduct = (DiagonalType::SizeAtCompileTime==1)
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
};

diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)

@ -791,6 +827,9 @@ public:

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
{
if(AsScalarProduct)
return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
else
return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
}

@ -407,7 +407,7 @@ protected:
*/
template<typename Derived>
template<typename Func>
typename internal::traits<Derived>::Scalar
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::redux(const Func& func) const
{
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");

@ -28,12 +28,13 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >

template<typename Derived> struct match {
enum {
IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime,
HasDirectAccess = internal::has_direct_access<Derived>::ret,
StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
|| int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
|| (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
OuterStrideMatch = Derived::IsVectorAtCompileTime
OuterStrideMatch = IsVectorAtCompileTime
|| int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
// NOTE, this indirection of evaluator<Derived>::Alignment is needed
// to workaround a very strange bug in MSVC related to the instantiation

@ -95,6 +96,8 @@ protected:
template<typename Expression>
EIGEN_DEVICE_FUNC void construct(Expression& expr)
{
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression);

if(PlainObjectType::RowsAtCompileTime==1)
{
eigen_assert(expr.rows()==1 || expr.cols()==1);
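
Folding both vector checks into one IsVectorAtCompileTime flag makes a Ref to a vector ignore storage order, since ordering is meaningless for one-dimensional objects. A sketch:

#include <Eigen/Core>

double first(Eigen::Ref<const Eigen::VectorXd> v) { return v[0]; }

int main()
{
  Eigen::RowVectorXd rv(3);
  rv << 1, 2, 3;
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(4, 4);
  first(rv.transpose());  // vector expression binds regardless of storage order
  first(m.col(1));        // a matrix column binds as well
  return 0;
}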
@ -71,7 +71,9 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView

EIGEN_DEVICE_FUNC
explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
{}
{
EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
}

EIGEN_DEVICE_FUNC
inline Index rows() const { return m_matrix.rows(); }
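
The new static assertion rejects template modes other than Lower or Upper at compile time instead of accepting them silently. A sketch:

#include <Eigen/Dense>

int main()
{
  Eigen::Matrix3d m = Eigen::Matrix3d::Random();
  m.selfadjointView<Eigen::Lower>();        // OK
  m.selfadjointView<Eigen::Upper>();        // OK
  // m.selfadjointView<Eigen::UnitLower>(); // now a compile-time error:
  // SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY
  return 0;
}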
@ -189,7 +191,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
|
|||
TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
|
||||
}
|
||||
|
||||
typedef SelfAdjointView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
|
||||
typedef SelfAdjointView<const MatrixConjugateReturnType,UpLo> ConjugateReturnType;
|
||||
/** \sa MatrixBase::conjugate() const */
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const ConjugateReturnType conjugate() const
|
||||
|
|
|
@ -15,33 +15,29 @@ namespace Eigen {
|
|||
// TODO generalize the scalar type of 'other'
|
||||
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
|
||||
{
|
||||
typedef typename Derived::PlainObject PlainObject;
|
||||
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
|
||||
return derived();
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
|
||||
{
|
||||
typedef typename Derived::PlainObject PlainObject;
|
||||
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
|
||||
return derived();
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
|
||||
{
|
||||
typedef typename Derived::PlainObject PlainObject;
|
||||
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
|
||||
return derived();
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
|
||||
{
|
||||
typedef typename Derived::PlainObject PlainObject;
|
||||
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
|
||||
return derived();
|
||||
}
|
||||
|
|
|
@ -34,12 +34,12 @@ template<typename Decomposition, typename RhsType,typename StorageKind> struct s
|
|||
template<typename Decomposition, typename RhsType>
|
||||
struct solve_traits<Decomposition,RhsType,Dense>
|
||||
{
|
||||
typedef Matrix<typename RhsType::Scalar,
|
||||
typedef typename make_proper_matrix_type<typename RhsType::Scalar,
|
||||
Decomposition::ColsAtCompileTime,
|
||||
RhsType::ColsAtCompileTime,
|
||||
RhsType::PlainObject::Options,
|
||||
Decomposition::MaxColsAtCompileTime,
|
||||
RhsType::MaxColsAtCompileTime> PlainObject;
|
||||
RhsType::MaxColsAtCompileTime>::type PlainObject;
|
||||
};
|
||||
|
||||
template<typename Decomposition, typename RhsType>
|
||||
|
|
|
@ -19,7 +19,7 @@ namespace internal {
|
|||
template<typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder>
|
||||
struct triangular_solve_vector;
|
||||
|
||||
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder>
|
||||
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder, int OtherInnerStride>
|
||||
struct triangular_solve_matrix;
|
||||
|
||||
// small helper struct extracting some traits on the underlying solver operation
|
||||
|
@ -98,8 +98,8 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
|
|||
BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
|
||||
|
||||
triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
|
||||
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
|
||||
::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride(), blocking);
|
||||
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor, Rhs::InnerStrideAtCompileTime>
|
||||
::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.innerStride(), rhs.outerStride(), blocking);
|
||||
}
|
||||
};
|
||||
|
||||
|
@@ -169,6 +169,9 @@ void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<Ot
   OtherDerived& other = _other.const_cast_derived();
   eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
   eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
+  // If solving for a 0x0 matrix, nothing to do, simply return.
+  if (derived().cols() == 0)
+    return;
 
   enum { copy = (internal::traits<OtherDerived>::Flags & RowMajorBit)  && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1};
   typedef typename internal::conditional<copy,
@@ -165,12 +165,13 @@ MatrixBase<Derived>::stableNorm() const
 
   typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
   typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
-  DerivedCopy copy(derived());
+  const DerivedCopy copy(derived());
 
   enum {
     CanAlign = (   (int(DerivedCopyClean::Flags)&DirectAccessBit)
                 || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
-               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization
+               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
+                 && (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
   };
   typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
                                          typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
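stableNorm() rescales blockwise so that squaring cannot overflow or underflow; the hunk above only tightens when the aligned fast path is taken. A short demonstration of why the method exists (values chosen so squaredNorm() overflows a double):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::VectorXd v = Eigen::VectorXd::Constant(4, 1e160);
      std::cout << v.norm() << "\n";        // inf: squaring 1e160 overflows
      std::cout << v.stableNorm() << "\n";  // 2e160
    }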
@@ -146,6 +146,8 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
     {
       return derived().nestedExpression().coeffRef(index);
     }
+  protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TransposeImpl)
 };
 
 /** \returns an expression of the transpose of *this.
@@ -34,17 +34,6 @@ class TranspositionsBase
       return derived();
     }
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Derived& operator=(const TranspositionsBase& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-#endif
-
     /** \returns the number of transpositions */
     Index size() const { return indices().size(); }
     /** \returns the number of rows of the equivalent permutation matrix */
@@ -171,12 +160,6 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
     inline Transpositions(const TranspositionsBase<OtherDerived>& other)
       : m_indices(other.indices()) {}
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Standard copy constructor. Defined only to prevent a default copy constructor
-      * from hiding the other templated constructor */
-    inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {}
-#endif
-
     /** Generic constructor from expression of the transposition indices. */
     template<typename Other>
     explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
@@ -189,17 +172,6 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
       return Base::operator=(other);
     }
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Transpositions& operator=(const Transpositions& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-#endif
-
     /** Constructs an uninitialized permutation matrix of given size.
       */
     inline Transpositions(Index size) : m_indices(size)
@@ -306,17 +278,6 @@ class TranspositionsWrapper
       return Base::operator=(other);
    }
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    TranspositionsWrapper& operator=(const TranspositionsWrapper& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-#endif
-
     /** const version of indices(). */
     const IndicesType& indices() const { return m_indices; }
 
@@ -384,7 +345,7 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
   const Product<OtherDerived, Transpose, AliasFreeProduct>
   operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
   {
-    return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt.derived());
+    return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt);
   }
 
   /** \returns the \a matrix with the inverse transpositions applied to the rows.
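The fix above passes the Transpose expression itself rather than its nested object when building the product. A sketch of the user-level operation this enables (public API; the 3x3 size and index values are arbitrary):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::Transpositions<3> T;
      T.indices() << 1, 2, 2;                   // swap(0,1), then swap(1,2)
      Eigen::Matrix3d M = Eigen::Matrix3d::Identity();
      Eigen::Matrix3d P = T * M;                // apply the row swaps
      std::cout << (T.transpose() * P) << "\n"; // identity again
    }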
@@ -217,9 +217,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
     explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
     {}
 
-    using Base::operator=;
-    TriangularView& operator=(const TriangularView &other)
-    { return Base::operator=(other); }
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)
 
     /** \copydoc EigenBase::rows() */
     EIGEN_DEVICE_FUNC
@@ -544,6 +542,10 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
     template<typename ProductType>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta);
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl)
+
 };
 
 /***************************************************************************
@@ -204,23 +204,7 @@ template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
   }
 };
 
-template<> struct conj_helper<Packet8f, Packet4cf, false,false>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
-  { return Packet4cf(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet4cf, Packet8f, false,false>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
-  { return Packet4cf(Eigen::internal::pmul(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
 
 template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
 {
@@ -400,23 +384,7 @@ template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
   }
 };
 
-template<> struct conj_helper<Packet4d, Packet2cd, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
-  { return Packet2cd(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cd, Packet4d, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
-  { return Packet2cd(Eigen::internal::pmul(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
 
 template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
 {
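Both hunks replace hand-written complex-times-real conj_helper specializations with the EIGEN_MAKE_CONJ_HELPER_CPLX_REAL macro, which generates the same operations. A scalar model of what the generated pmadd computes, using only the standard library (a real factor needs no conjugation):

    #include <complex>
    #include <cstdio>

    // Per-lane model of conj_helper<real, complex>::pmadd(x, y, c).
    std::complex<float> madd(float x, std::complex<float> y, std::complex<float> c) {
      return c + x * y;
    }

    int main() {
      std::complex<float> r = madd(2.0f, {1.0f, -3.0f}, {0.5f, 0.5f});
      std::printf("(%g, %g)\n", r.real(), r.imag()); // (2.5, -5.5)
    }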
@@ -159,11 +159,12 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
 
 #ifdef __FMA__
 template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
-#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
-  // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
-  // and gcc stupidly generates a vfmadd132ps instruction,
-  // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
-  // the result of the product.
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
+  // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
+  // and even register spilling with clang>=6.0 (bug 1637).
+  // Gcc stupidly generates a vfmadd132ps instruction.
+  // So let's enforce it to generate a vfmadd231ps instruction since the most common use
+  // case is to accumulate the result of the product.
   Packet8f res = c;
   __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
   return res;
@@ -172,7 +173,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f&
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
-#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
   // see above
   Packet4d res = c;
   __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
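The asm blocks only pin the compiler to the accumulating vfmadd231 form; semantically pmadd(a,b,c) is a single-rounding fused multiply-add per lane. A scalar model using std::fma (not the packet code itself) that makes the single rounding visible:

    #include <cmath>
    #include <cstdio>

    int main() {
      double a = 1e16;
      double b = 1.0 + std::ldexp(1.0, -52);     // 1 + 2^-52, exactly representable
      double c = -1e16;
      std::printf("%.17g\n", a * b + c);         // 2 (product rounded before the add)
      std::printf("%.17g\n", std::fma(a, b, c)); // ~2.2204460492503131 (one rounding)
    }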
@@ -308,9 +309,9 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
 }
 
 #ifndef EIGEN_VECTORIZE_AVX512
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 #endif
 
 template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
@@ -333,9 +334,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
 {
   __m256d tmp = _mm256_shuffle_pd(a,a,5);
   return _mm256_permute2f128_pd(tmp, tmp, 1);
-
+  #if 0
+  // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
+  // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
   __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
   return _mm256_permute_pd(swap_halves,5);
+  #endif
 }
 
 // pabs should be ok
@@ -29,6 +29,7 @@ namespace internal {
 #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
   const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
 
+
 // Natural logarithm
 // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
 // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
@@ -47,6 +48,7 @@ plog<Packet16f>(const Packet16f& _x) {
   // The smallest non denormalized float number.
   _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
   _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
   _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
 
   // Polynomial coefficients.
@@ -64,10 +66,8 @@ plog<Packet16f>(const Packet16f& _x) {
   _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
 
   // invalid_mask is set to true when x is NaN
-  __mmask16 invalid_mask =
-      _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
-  __mmask16 iszero_mask =
-      _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ);
+  __mmask16 invalid_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
+  __mmask16 iszero_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ);
 
   // Truncate input values to the minimum positive normal.
   x = pmax(x, p16f_min_norm_pos);
@@ -88,9 +88,9 @@ plog<Packet16f>(const Packet16f& _x) {
   //   x = x + x - 1.0;
   // } else { x = x - 1.0; }
   __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
-  Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps());
+  Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
   x = psub(x, p16f_1);
-  e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps()));
+  e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
   x = padd(x, tmp);
 
   Packet16f x2 = pmul(x, x);
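The operand swap reflects the documented semantics of _mm512_mask_blend_ps(k, a, b): lane i takes b[i] where bit i of k is set and a[i] otherwise, so the value selected by the mask belongs in the last argument. A one-lane scalar model:

    // Scalar model of one lane of _mm512_mask_blend_ps(k, a, b).
    float blend_lane(bool mask_bit, float a, float b) {
      return mask_bit ? b : a;
    }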
@@ -118,10 +118,18 @@ plog<Packet16f>(const Packet16f& _x) {
   x = padd(x, y);
   x = padd(x, y2);
 
-  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
-  return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf,
-                              _mm512_mask_blend_ps(invalid_mask, p16f_nan, x));
+  __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
+  // Filter out invalid inputs, i.e.:
+  //  - negative arg will be NAN,
+  //  - 0 will be -INF.
+  //  - +INF will be +INF
+  return _mm512_mask_blend_ps(iszero_mask,
+                              _mm512_mask_blend_ps(invalid_mask,
+                                                   _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
+                                                   p16f_nan),
+                              p16f_minus_inf);
 }
 
 #endif
 
 // Exponential function. Works by writing "x = m*log(2) + r" where
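The rewritten tail fixes plog's special values lane by lane. The scalar behavior it matches (IEEE semantics of std::log):

    #include <cmath>
    #include <cstdio>

    int main() {
      std::printf("%f\n", std::log(-1.0));      // nan  (invalid argument)
      std::printf("%f\n", std::log(0.0));       // -inf
      std::printf("%f\n", std::log(INFINITY));  // inf  (+inf passes through)
    }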
@@ -257,50 +265,39 @@ pexp<Packet8d>(const Packet8d& _x) {
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 psqrt<Packet16f>(const Packet16f& _x) {
-  _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
-  _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
-
-  Packet16f neg_half = pmul(_x, p16f_minus_half);
+  Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
+  __mmask16 denormal_mask = _mm512_kand(
+      _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
+                         _CMP_LT_OQ),
+      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
 
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
-  Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
-                                     _mm512_setzero_ps());
+  Packet16f x = _mm512_rsqrt14_ps(_x);
 
   // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
 
-  // Multiply the original _x by it's reciprocal square root to extract the
-  // square root.
-  return pmul(_x, x);
+  // Flush results for denormals to zero.
+  return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
 }
 
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 psqrt<Packet8d>(const Packet8d& _x) {
-  _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
-  _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
-  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
-
-  Packet8d neg_half = pmul(_x, p8d_minus_half);
+  Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
+  __mmask16 denormal_mask = _mm512_kand(
+      _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
+                         _CMP_LT_OQ),
+      _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
 
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
-  Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
-                                    _mm512_setzero_pd());
+  Packet8d x = _mm512_rsqrt14_pd(_x);
 
-  // Do a first step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
 
   // Do a second step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
 
-  // Multiply the original _x by it's reciprocal square root to extract the
-  // square root.
-  return pmul(_x, x);
+  return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
 }
 #else
 template <>
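psqrt computes sqrt(x) as x * rsqrt(x), refining the hardware estimate with Newton steps; the rewrite also blends denormal inputs to zero instead of masking the estimate. A scalar sketch of the iteration y <- y * (1.5 + (-0.5*x) * y*y) (constants mirror the packet code, the starting estimate is arbitrary):

    #include <cstdio>

    int main() {
      double x = 2.0;
      double y = 0.7;                       // crude 1/sqrt(2) estimate
      double neg_half = -0.5 * x;
      for (int i = 0; i < 2; ++i)           // two steps, as in psqrt<Packet8d>
        y = y * (1.5 + neg_half * y * y);   // each step ~doubles the correct bits
      std::printf("%.15f\n", x * y);        // ~1.414213562373...
    }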
@@ -333,20 +330,18 @@ prsqrt<Packet16f>(const Packet16f& _x) {
   // select only the inverse sqrt of positive normal inputs (denormals are
   // flushed to zero and cause infs as well).
   __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
-  Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(),
-                                     _mm512_rsqrt14_ps(_x));
+  Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
 
   // Fill in NaNs and Infs for the negative/zero entries.
   __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
   Packet16f infs_and_nans = _mm512_mask_blend_ps(
-      neg_mask, p16f_nan,
-      _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
+      neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
 
   // Do a single step of Newton's iteration.
   x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
 
   // Insert NaNs and Infs in all the right places.
-  return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x);
+  return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
 }
 
 template <>
@@ -363,14 +358,12 @@ prsqrt<Packet8d>(const Packet8d& _x) {
   // select only the inverse sqrt of positive normal inputs (denormals are
   // flushed to zero and cause infs as well).
   __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
-  Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(),
-                                    _mm512_rsqrt14_pd(_x));
+  Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
 
   // Fill in NaNs and Infs for the negative/zero entries.
   __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
   Packet8d infs_and_nans = _mm512_mask_blend_pd(
-      neg_mask, p8d_nan,
-      _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
+      neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
 
   // Do a first step of Newton's iteration.
   x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
@@ -379,9 +372,9 @@ prsqrt<Packet8d>(const Packet8d& _x) {
   x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
 
   // Insert NaNs and Infs in all the right places.
-  return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x);
+  return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
 }
-#else
+#elif defined(EIGEN_VECTORIZE_AVX512ER)
 template <>
 EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
   return _mm512_rsqrt28_ps(x);
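With AVX512ER the 28-bit rsqrt28 estimate is accurate enough to skip the Newton step entirely, hence the new #elif branch. Whichever path is taken, the lane-wise special cases must match the scalar reference:

    #include <cmath>
    #include <cstdio>

    int main() {
      std::printf("%f\n", 1.0 / std::sqrt(4.0));   // 0.5
      std::printf("%f\n", 1.0 / std::sqrt(0.0));   // inf
      std::printf("%f\n", 1.0 / std::sqrt(-1.0));  // nan
    }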
@@ -19,10 +19,10 @@ namespace internal {
 #endif
 
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
 #endif
 
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
@@ -54,13 +54,14 @@ template<> struct packet_traits<float> : default_packet_traits
     AlignedOnScalar = 1,
     size = 16,
     HasHalfPacket = 1,
-#if EIGEN_GNUC_AT_LEAST(5, 3)
+    HasBlend = 0,
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
 #ifdef EIGEN_VECTORIZE_AVX512DQ
     HasLog = 1,
 #endif
     HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
+    HasSqrt = EIGEN_FAST_MATH,
+    HasRsqrt = EIGEN_FAST_MATH,
 #endif
     HasDiv = 1
   };
@@ -74,8 +75,8 @@ template<> struct packet_traits<double> : default_packet_traits
     AlignedOnScalar = 1,
     size = 8,
     HasHalfPacket = 1,
-#if EIGEN_GNUC_AT_LEAST(5, 3)
-    HasSqrt = 1,
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
+    HasSqrt = EIGEN_FAST_MATH,
+    HasRsqrt = EIGEN_FAST_MATH,
 #endif
     HasDiv = 1
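These trait flags are what Eigen's dispatchers consult when deciding whether a math function may be vectorized (HasSqrt/HasRsqrt now additionally depend on EIGEN_FAST_MATH). A sketch of inspecting them; note that packet_traits lives in Eigen::internal and is not a stable public API:

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      typedef Eigen::internal::packet_traits<float> Traits;
      std::cout << "packet size: " << Traits::size << "\n"
                << "HasSqrt:     " << int(Traits::HasSqrt) << "\n";
    }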
@@ -98,6 +99,7 @@ template <>
 struct unpacket_traits<Packet16f> {
   typedef float type;
   typedef Packet8f half;
+  typedef Packet16i integer_packet;
   enum { size = 16, alignment=Aligned64 };
 };
 template <>
@@ -132,7 +134,7 @@ EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
-  return _mm512_broadcastsd_pd(_mm_load_pd1(from));
+  return _mm512_set1_pd(*from);
 }
 
 template <>
@@ -158,6 +160,11 @@ EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
                                             const Packet8d& b) {
   return _mm512_add_pd(a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_add_epi32(a, b);
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
@@ -169,6 +176,11 @@ EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
                                             const Packet8d& b) {
   return _mm512_sub_pd(a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_sub_epi32(a, b);
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
@@ -202,6 +214,11 @@ EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
                                             const Packet8d& b) {
   return _mm512_mul_pd(a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_mul_epi32(a, b);
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
@@ -214,7 +231,7 @@ EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
   return _mm512_div_pd(a, b);
 }
 
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
 template <>
 EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
                                     const Packet16f& c) {
@@ -230,23 +247,73 @@ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
 template <>
 EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
                                               const Packet16f& b) {
-  return _mm512_min_ps(a, b);
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm512_min_ps(b, a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
                                             const Packet8d& b) {
-  return _mm512_min_pd(a, b);
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm512_min_pd(b, a);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
                                               const Packet16f& b) {
-  return _mm512_max_ps(a, b);
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm512_max_ps(b, a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
                                             const Packet8d& b) {
-  return _mm512_max_pd(a, b);
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm512_max_pd(b, a);
 }
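The operand reversal matters only when a NaN is involved: the x86 min/max instructions return their second source operand on a NaN, while std::min(a,b) returns its first argument unless b compares less than a. A scalar illustration of the asymmetry being matched:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
      double a = 1.0, q = std::nan("");
      std::printf("%f\n", std::min(a, q)); // 1.000000 (NaN in 2nd slot ignored)
      std::printf("%f\n", std::min(q, a)); // nan      (NaN in 1st slot propagates)
    }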
 
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
+#else
+// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
+  return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
+}
+
+// AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
+  return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
+}
+
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
+  return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
+                                                _mm256_castps_si256(b),1));
+}
+#endif
+
+// Helper function for bit packing snippet of low precision comparison.
+// It packs the flags from 32x16 to 16x16.
+EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
+  // Split data into small pieces and handle with AVX instructions
+  // to guarantee internal order of vector.
+  // Operation:
+  //   dst[15:0]    := Saturate16(rf[31:0])
+  //   dst[31:16]   := Saturate16(rf[63:32])
+  //   ...
+  //   dst[255:240] := Saturate16(rf[255:224])
+  __m256i lo = _mm256_castps_si256(extract256<0>(rf));
+  __m256i hi = _mm256_castps_si256(extract256<1>(rf));
+  __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
+                                      _mm256_extractf128_si256(lo, 1));
+  __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
+                                      _mm256_extractf128_si256(hi, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_and_si512(a,b);
+}
+
 template <>
@@ -255,24 +322,7 @@ EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_and_ps(a, b);
 #else
-  Packet16f res = _mm512_undefined_ps();
-  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
-  res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0);
-
-  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
-  res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1);
-
-  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
-  res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2);
-
-  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
-  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
-  res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3);
-
-  return res;
+  return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
 #endif
 }
 template <>
@@ -288,35 +338,21 @@ EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
 
   Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
   Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
-  res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
-
-  return res;
+  return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
 #endif
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a,
-                                             const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_or_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_or_ps(a, b);
 #else
-  Packet16f res = _mm512_undefined_ps();
-  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
-  res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0);
-
-  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
-  res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1);
-
-  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
-  res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2);
-
-  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
-  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
-  res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3);
-
-  return res;
+  return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
 #endif
 }
 
@@ -326,109 +362,67 @@ EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_or_pd(a, b);
 #else
-  Packet8d res = _mm512_undefined_pd();
-  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
-  res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0);
-
-  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
-  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
-  res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1);
-
-  return res;
+  return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
 #endif
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a,
-                                              const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_xor_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_xor_ps(a, b);
 #else
-  Packet16f res = _mm512_undefined_ps();
-  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
-  res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0);
-
-  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
-  res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1);
-
-  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
-  res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2);
-
-  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
-  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
-  res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3);
-
-  return res;
+  return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
 #endif
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a,
-                                            const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_xor_pd(a, b);
 #else
-  Packet8d res = _mm512_undefined_pd();
-  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
-  res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0);
-
-  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
-  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
-  res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1);
-
-  return res;
+  return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
 #endif
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a,
-                                                 const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_andnot_si512(b, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
-  return _mm512_andnot_ps(a, b);
+  return _mm512_andnot_ps(b, a);
 #else
-  Packet16f res = _mm512_undefined_ps();
-  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
-  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0);
-
-  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
-  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1);
-
-  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
-  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2);
-
-  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
-  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
-  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3);
-
-  return res;
+  return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
 #endif
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,
-                                               const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
-  return _mm512_andnot_pd(a, b);
+  return _mm512_andnot_pd(b, a);
 #else
-  Packet8d res = _mm512_undefined_pd();
-  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
-  res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0);
-
-  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
-  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
-  res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1);
-
-  return res;
+  return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
 #endif
 }
 
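All of the lane-by-lane fallbacks above collapse into a reinterpret-as-integer round trip: the cast intrinsics move no data, so one 512-bit integer instruction replaces four 128-bit ones. A portable scalar model of the same trick (memcpy plays the role of _mm512_castps_si512):

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    int main() {
      float a = -3.5f;
      std::uint32_t bits;
      std::memcpy(&bits, &a, sizeof bits);
      bits &= 0x7fffffffu;              // clear the sign bit, as pabs does
      std::memcpy(&a, &bits, sizeof a);
      std::printf("%f\n", a);           // 3.500000
    }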
+template<int N> EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
+  return _mm512_srai_epi32(a, N);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
+  return _mm512_srli_epi32(a, N);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
+  return _mm512_slli_epi32(a, N);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
|
@ -461,75 +455,55 @@ EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
|
|||
// {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
|
||||
Packet8f lane0 = _mm256_broadcast_ps((const __m128*)(const void*)from);
|
||||
// mimic an "inplace" permutation of the lower 128bits using a blend
|
||||
lane0 = _mm256_blend_ps(
|
||||
lane0, _mm256_castps128_ps256(_mm_permute_ps(
|
||||
_mm256_castps256_ps128(lane0), _MM_SHUFFLE(1, 0, 1, 0))),
|
||||
15);
|
||||
// then we can perform a consistent permutation on the global register to get
|
||||
// everything in shape:
|
||||
lane0 = _mm256_permute_ps(lane0, _MM_SHUFFLE(3, 3, 2, 2));
|
||||
|
||||
Packet8f lane1 = _mm256_broadcast_ps((const __m128*)(const void*)(from + 4));
|
||||
// mimic an "inplace" permutation of the lower 128bits using a blend
|
||||
lane1 = _mm256_blend_ps(
|
||||
lane1, _mm256_castps128_ps256(_mm_permute_ps(
|
||||
_mm256_castps256_ps128(lane1), _MM_SHUFFLE(1, 0, 1, 0))),
|
||||
15);
|
||||
// then we can perform a consistent permutation on the global register to get
|
||||
// everything in shape:
|
||||
lane1 = _mm256_permute_ps(lane1, _MM_SHUFFLE(3, 3, 2, 2));
|
||||
// an unaligned load is required here as there is no requirement
|
||||
// on the alignment of input pointer 'from'
|
||||
__m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
|
||||
__m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
|
||||
__m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
return pairs;
|
||||
}
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
Packet16f res = _mm512_undefined_ps();
|
||||
return _mm512_insertf32x8(res, lane0, 0);
|
||||
return _mm512_insertf32x8(res, lane1, 1);
|
||||
return res;
|
||||
#else
|
||||
Packet16f res = _mm512_undefined_ps();
|
||||
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 0), 0);
|
||||
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 1), 1);
|
||||
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 0), 2);
|
||||
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 1), 3);
|
||||
return res;
|
||||
#endif
|
||||
}
|
||||
// FIXME: this does not look optimal, better load a Packet4d and shuffle...
|
||||
// Loads 4 doubles from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3,
|
||||
// a3}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
|
||||
Packet4d lane0 = _mm256_broadcast_pd((const __m128d*)(const void*)from);
|
||||
lane0 = _mm256_permute_pd(lane0, 3 << 2);
|
||||
|
||||
Packet4d lane1 = _mm256_broadcast_pd((const __m128d*)(const void*)(from + 2));
|
||||
lane1 = _mm256_permute_pd(lane1, 3 << 2);
|
||||
|
||||
Packet8d res = _mm512_undefined_pd();
|
||||
res = _mm512_insertf64x4(res, lane0, 0);
|
||||
return _mm512_insertf64x4(res, lane1, 1);
|
||||
__m512d x = _mm512_setzero_pd();
|
||||
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
|
||||
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
|
||||
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
|
||||
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
|
||||
return x;
|
||||
}
|
||||
#else
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
|
||||
__m512d x = _mm512_setzero_pd();
|
||||
x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
|
||||
x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
|
||||
x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
|
||||
x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
|
||||
return x;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Loads 4 floats from memory a returns the packet
|
||||
// {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
|
||||
Packet16f tmp = _mm512_undefined_ps();
|
||||
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
|
||||
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
|
||||
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
|
||||
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
|
||||
return tmp;
|
||||
Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
|
||||
const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
|
||||
return _mm512_permutexvar_ps(scatter_mask, tmp);
|
||||
}
|
||||
|
||||
// Loads 2 doubles from memory a returns the packet
|
||||
// {a0, a0 a0, a0, a1, a1, a1, a1}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
|
||||
Packet8d tmp = _mm512_undefined_pd();
|
||||
Packet2d tmp0 = _mm_load_pd1(from);
|
||||
Packet2d tmp1 = _mm_load_pd1(from + 1);
|
||||
Packet4d lane0 = _mm256_broadcastsd_pd(tmp0);
|
||||
Packet4d lane1 = _mm256_broadcastsd_pd(tmp1);
|
||||
__m256d lane0 = _mm256_set1_pd(*from);
|
||||
__m256d lane1 = _mm256_set1_pd(*(from+1));
|
||||
__m512d tmp = _mm512_undefined_pd();
|
||||
tmp = _mm512_insertf64x4(tmp, lane0, 0);
|
||||
return _mm512_insertf64x4(tmp, lane1, 1);
|
||||
}
|
||||
|
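The new ploaddup/ploadquad bodies replace insert/extract chains with a single load plus one permute; their contracts are unchanged. A scalar reference for what ploaddup produces:

    #include <cstdio>

    // Each source element is duplicated into two consecutive lanes:
    // {a0,a0, a1,a1, ...}. ploadquad repeats each element four times.
    void loaddup_ref(const float* from, float* out, int n_out) {
      for (int i = 0; i < n_out; ++i) out[i] = from[i / 2];
    }

    int main() {
      const float src[4] = {1, 2, 3, 4};
      float dst[8];
      loaddup_ref(src, dst, 8);
      for (float v : dst) std::printf("%g ", v); // 1 1 2 2 3 3 4 4
    }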
@@ -565,7 +539,7 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
 template <>
 EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
                                                              Index stride) {
-  Packet16i stride_vector = _mm512_set1_epi32(stride);
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
   Packet16i stride_multiplier =
       _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@@ -575,7 +549,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
 template <>
 EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
                                                             Index stride) {
-  Packet8i stride_vector = _mm256_set1_epi32(stride);
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
   Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
   Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
 
@@ -586,7 +560,7 @@ template <>
 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
                                                          const Packet16f& from,
                                                          Index stride) {
-  Packet16i stride_vector = _mm512_set1_epi32(stride);
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
   Packet16i stride_multiplier =
       _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@@ -596,7 +570,7 @@ template <>
 EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
                                                          const Packet8d& from,
                                                          Index stride) {
-  Packet8i stride_vector = _mm256_set1_epi32(stride);
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
   Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
   Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
   _mm512_i32scatter_pd(to, indices, from, 8);
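convert_index<int> narrows Eigen's 64-bit Index to the 32-bit per-lane indices the gather/scatter instructions consume. A scalar reference for the strided gather these kernels implement:

    #include <cstdio>

    // Lane i of pgather reads from[i*stride].
    void gather_ref(const float* from, long stride, float* out, int lanes) {
      for (int i = 0; i < lanes; ++i) out[i] = from[i * stride];
    }

    int main() {
      const float data[8] = {0, 10, 20, 30, 40, 50, 60, 70};
      float out[4];
      gather_ref(data, 2, out, 4);
      for (float v : out) std::printf("%g ", v); // 0 20 40 60
    }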
@@ -618,9 +592,9 @@ EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
   pstore(to, pa);
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 
 template <>
 EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
@@ -648,20 +622,20 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
 template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
 {
-  // _mm512_abs_ps intrinsic not found, so hack around it
-  return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff));
+  return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
-  // _mm512_abs_ps intrinsic not found, so hack around it
-  return (__m512d)_mm512_and_si512((__m512i)a,
-                                   _mm512_set1_epi64(0x7fffffffffffffff));
+  return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
+                                              _mm512_set1_epi64(0x7fffffffffffffff)));
 }
 
 #ifdef EIGEN_VECTORIZE_AVX512DQ
 // AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
-  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0) __m256 OUTPUT##_1 = \
-      _mm512_extractf32x8_ps(INPUT, 1)
+  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
+  __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
 #else
 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
   __m256 OUTPUT##_0 = _mm256_insertf128_ps( \
@@ -674,17 +648,136 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
 
 #ifdef EIGEN_VECTORIZE_AVX512DQ
 #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
-  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \
-  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
+  OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
 #else
 #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
   OUTPUT = _mm512_undefined_ps(); \
   OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
   OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
   OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
   OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
 #endif
-template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f*
-                                                            vecs)
 
+template <>
+EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
+  __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
+  Packet8f x = _mm256_add_ps(lane0, lane1);
+  return predux<Packet8f>(x);
+#else
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
+  sum = _mm_hadd_ps(sum, sum);
+  sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
+  return _mm_cvtss_f32(sum);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d sum = _mm256_add_pd(lane0, lane1);
+  __m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
+  return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  return padd(lane0, lane1);
+#else
+  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
+  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
+  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
+  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
+  Packet4f sum0 = padd(lane0, lane2);
+  Packet4f sum1 = padd(lane1, lane3);
+  return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  Packet4d res = padd(lane0, lane1);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
+//#ifdef EIGEN_VECTORIZE_AVX512DQ
+#if 0
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  Packet8f res = pmul(lane0, lane1);
+  res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
+  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+#else
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
+  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d res = pmul(lane0, lane1);
+  res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
+  return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
+  res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+}
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d res = _mm256_min_pd(lane0, lane1);
+  res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
+  return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
+  res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d res = _mm256_max_pd(lane0, lane1);
+  res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
+  return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
+}
+
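The reductions relocated above all follow the same halving scheme: split the 512-bit register, combine the halves, and repeat until one value remains (log2 N combines for N lanes). A scalar model of predux:

    #include <cstdio>

    int main() {
      float lanes[16];
      for (int i = 0; i < 16; ++i) lanes[i] = float(i + 1);
      for (int width = 8; width >= 1; width /= 2)   // 16 -> 8 -> 4 -> 2 -> 1
        for (int i = 0; i < width; ++i) lanes[i] += lanes[i + width];
      std::printf("%g\n", lanes[0]);                // 136 = 1+2+...+16
    }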
+template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f* vecs)
 {
   EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0);
   EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1);
@ -874,173 +967,6 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
|
|||
return _mm512_insertf64x4(final_output, final_1, 1);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
|
||||
//#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
#if 0
|
||||
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
|
||||
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
|
||||
Packet8f sum = padd(lane0, lane1);
|
||||
Packet8f tmp0 = _mm256_hadd_ps(sum, _mm256_permute2f128_ps(a, a, 1));
|
||||
tmp0 = _mm256_hadd_ps(tmp0, tmp0);
|
||||
return pfirst(_mm256_hadd_ps(tmp0, tmp0));
|
||||
#else
|
||||
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
|
||||
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
|
||||
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
|
||||
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
|
||||
Packet4f sum = padd(padd(lane0, lane1), padd(lane2, lane3));
|
||||
sum = _mm_hadd_ps(sum, sum);
|
||||
sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
|
||||
return pfirst(sum);
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
|
||||
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
|
||||
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
|
||||
Packet4d sum = padd(lane0, lane1);
|
||||
Packet4d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
|
||||
return pfirst(_mm256_hadd_pd(tmp0, tmp0));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
|
||||
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
|
||||
return padd(lane0, lane1);
|
||||
#else
|
||||
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
|
||||
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
|
||||
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
|
||||
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
|
||||
Packet4f sum0 = padd(lane0, lane2);
|
||||
Packet4f sum1 = padd(lane1, lane3);
|
||||
return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
|
||||
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
|
||||
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
|
||||
Packet4d res = padd(lane0, lane1);
|
||||
return res;
|
||||
}

template <>
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
//#ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
  Packet8f res = pmul(lane0, lane1);
  res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#else
  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
  Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#endif
}
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
  Packet4d res = pmul(lane0, lane1);
  res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
  return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
}

template <>
EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
  Packet4f res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
  res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
  return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
  Packet4d res = _mm256_min_pd(lane0, lane1);
  res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
  return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
}

template <>
EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
  Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
  res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
  return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
  Packet4d res = _mm256_max_pd(lane0, lane1);
  res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
  return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
}

template <int Offset>
struct palign_impl<Offset, Packet16f> {
  static EIGEN_STRONG_INLINE void run(Packet16f& first,
                                      const Packet16f& second) {
    if (Offset != 0) {
      __m512i first_idx = _mm512_set_epi32(
          Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
          Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
          Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);

      __m512i second_idx =
          _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
                           Offset - 5, Offset - 6, Offset - 7, Offset - 8,
                           Offset - 9, Offset - 10, Offset - 11, Offset - 12,
                           Offset - 13, Offset - 14, Offset - 15, Offset - 16);

      unsigned short mask = 0xFFFF;
      mask <<= (16 - Offset);

      first = _mm512_permutexvar_ps(first_idx, first);
      Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
      first = _mm512_mask_blend_ps(mask, first, tmp);
    }
  }
};
template <int Offset>
struct palign_impl<Offset, Packet8d> {
  static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
    if (Offset != 0) {
      __m512i first_idx = _mm512_set_epi32(
          0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
          Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);

      __m512i second_idx = _mm512_set_epi32(
          0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
          Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);

      unsigned char mask = 0xFF;
      mask <<= (8 - Offset);

      first = _mm512_permutexvar_pd(first_idx, first);
      Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
      first = _mm512_mask_blend_pd(mask, first, tmp);
    }
  }
};
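// [Editor's sketch, not part of the commit] palign_impl<Offset> shifts the
// concatenation (first, second) left by Offset lanes: the permutes rotate each
// source so the wanted elements line up, and the bit mask blends the top
// Offset lanes in from `second`. A scalar model of the 16-lane case:
//
//   template <int Offset>
//   void palign16_model(float (&first)[16], const float (&second)[16]) {
//     float out[16];
//     for (int i = 0; i < 16; ++i)
//       out[i] = (i + Offset < 16) ? first[i + Offset] : second[i + Offset - 16];
//     for (int i = 0; i < 16; ++i) first[i] = out[i];
//   }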


#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \

@@ -1302,13 +1228,76 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
  return Packet16f();
}
template <>
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/,
                                    const Packet8d& /*thenPacket*/,
                                    const Packet8d& /*elsePacket*/) {
  assert(false && "To be implemented");
  return Packet8d();
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
                                    const Packet8d& thenPacket,
                                    const Packet8d& elsePacket) {
  __mmask8 m = (ifPacket.select[0]   )
             | (ifPacket.select[1]<<1)
             | (ifPacket.select[2]<<2)
             | (ifPacket.select[3]<<3)
             | (ifPacket.select[4]<<4)
             | (ifPacket.select[5]<<5)
             | (ifPacket.select[6]<<6)
             | (ifPacket.select[7]<<7);
  return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
}
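// [Editor's sketch, not part of the commit] The new pblend packs the eight
// selector entries into an AVX-512 bit mask; bit i set means "take thenPacket
// lane i". A scalar model of the mask construction (hypothetical helper name):
//
//   inline unsigned char blend_mask_model(const bool (&select)[8]) {
//     unsigned char m = 0;
//     for (int i = 0; i < 8; ++i)
//       if (select[i]) m |= static_cast<unsigned char>(1u << i);
//     return m;  // then: _mm512_mask_blend_pd(m, elsePacket, thenPacket)
//   }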

template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
  return _mm512_cvttps_epi32(a);
}

template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
  return _mm512_cvtepi32_ps(a);
}
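// [Editor's note] The float-to-int cast uses the truncating conversion
// (_mm512_cvttps_epi32, note the extra 't'), which matches scalar C++
// semantics: static_cast<int>(1.9f) == 1 and static_cast<int>(-1.9f) == -1,
// i.e. rounding toward zero rather than to nearest.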

template <int Offset>
struct palign_impl<Offset, Packet16f> {
  static EIGEN_STRONG_INLINE void run(Packet16f& first,
                                      const Packet16f& second) {
    if (Offset != 0) {
      __m512i first_idx = _mm512_set_epi32(
          Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
          Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
          Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);

      __m512i second_idx =
          _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
                           Offset - 5, Offset - 6, Offset - 7, Offset - 8,
                           Offset - 9, Offset - 10, Offset - 11, Offset - 12,
                           Offset - 13, Offset - 14, Offset - 15, Offset - 16);

      unsigned short mask = 0xFFFF;
      mask <<= (16 - Offset);

      first = _mm512_permutexvar_ps(first_idx, first);
      Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
      first = _mm512_mask_blend_ps(mask, first, tmp);
    }
  }
};
template <int Offset>
struct palign_impl<Offset, Packet8d> {
  static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
    if (Offset != 0) {
      __m512i first_idx = _mm512_set_epi32(
          0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
          Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);

      __m512i second_idx = _mm512_set_epi32(
          0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
          Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);

      unsigned char mask = 0xFF;
      mask <<= (8 - Offset);

      first = _mm512_permutexvar_pd(first_idx, first);
      Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
      first = _mm512_mask_blend_pd(mask, first, tmp);
    }
  }
};


} // end namespace internal

} // end namespace Eigen

@@ -65,7 +65,7 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
  Packet2cf res;
  if((ptrdiff_t(&from) % 16) == 0)
  if((std::ptrdiff_t(&from) % 16) == 0)
    res.v = pload<Packet4f>((const float *)&from);
  else
    res.v = ploadu<Packet4f>((const float *)&from);

@@ -224,23 +224,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
  }
};

template<> struct conj_helper<Packet4f, Packet2cf, false,false>
{
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
  { return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
};

template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
  { return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)

template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{

@@ -416,23 +400,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
    return pconj(internal::pmul(a, b));
  }
};
template<> struct conj_helper<Packet2d, Packet1cd, false,false>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
  { return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
};

template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
  { return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)

template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{

@@ -90,7 +90,7 @@ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
#define _EIGEN_MASK_ALIGNMENT 0xfffffff0
#endif

#define _EIGEN_ALIGNED_PTR(x) ((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)

// Handle endianness properly while loading constants
// Define global static constants:

@@ -388,10 +388,28 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }

template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifdef __VSX__
  Packet4f ret;
  __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
  return ret;
#else
  return vec_min(a, b);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }

template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifdef __VSX__
  Packet4f ret;
  __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
  return ret;
#else
  return vec_max(a, b);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
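// [Editor's note] On VSX the float min/max above are spelled as a compare
// (xvcmpgesp / xvcmpgtsp) followed by a lane select (xxsel) instead of
// vec_min/vec_max. Assuming xxsel picks the second source where the compare
// mask is set, the selected behaviour corresponds to the scalar model
//
//   pmin: ret[i] = (a[i] >= b[i]) ? b[i] : a[i];
//   pmax: ret[i] = (b[i] >  a[i]) ? b[i] : a[i];
//
// which pins down the result when an operand is NaN (the compare is then
// false, so a[i] is returned).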

template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }

@@ -450,14 +468,14 @@ template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
  Packet4f p;
  if((ptrdiff_t(from) % 16) == 0)      p = pload<Packet4f>(from);
  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from);
  else                                 p = ploadu<Packet4f>(from);
  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
  Packet4i p;
  if((ptrdiff_t(from) % 16) == 0)      p = pload<Packet4i>(from);
  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from);
  else                                 p = ploadu<Packet4i>(from);
  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}

@@ -910,9 +928,19 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const
// for some weird reasons, it has to be overloaded for packet of integers
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }

template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  Packet2d ret;
  __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
  return ret;
}

template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  Packet2d ret;
  __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
  return ret;
}

template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }

@@ -935,7 +963,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{
  Packet2d p;
  if((ptrdiff_t(from) % 16) == 0)      p = pload<Packet2d>(from);
  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from);
  else                                 p = ploadu<Packet2d>(from);
  return vec_splat_dbl<0>(p);
}

@@ -1022,7 +1050,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {

template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
  Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
  Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
  Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
  return vec_sel(elsePacket, thenPacket, mask);
}
#endif // __VSX__

@@ -13,7 +13,7 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

@@ -29,7 +29,7 @@
// type Eigen::half (inheriting from CUDA's __half struct) with
// operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay
// in fp32 for CPUs, except for simple parameter conversions, I/O
// in float32_bits for CPUs, except for simple parameter conversions, I/O
// to disk and the likes), but fast on GPUs.


@@ -42,6 +42,7 @@
#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
#endif

#include <sstream>

namespace Eigen {

@@ -50,38 +51,45 @@ struct half;
namespace half_impl {

#if !defined(EIGEN_HAS_CUDA_FP16)

// Make our own __half definition that is similar to CUDA's.
struct __half {
  EIGEN_DEVICE_FUNC __half() {}
  explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
// Make our own __half_raw definition that is similar to CUDA's.
struct __half_raw {
  EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
  explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
  unsigned short x;
};

#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
typedef __half __half_raw;
#endif

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);

struct half_base : public __half {
struct half_base : public __half_raw {
  EIGEN_DEVICE_FUNC half_base() {}
  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
  EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
#endif
};

} // namespace half_impl

// Class definition.
struct half : public half_impl::half_base {
#if !defined(EIGEN_HAS_CUDA_FP16)
  typedef half_impl::__half __half;
#if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
  typedef half_impl::__half_raw __half_raw;
#endif

  EIGEN_DEVICE_FUNC half() {}

  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
  EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
  EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
#endif

  explicit EIGEN_DEVICE_FUNC half(bool b)
      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}

@@ -138,71 +146,125 @@ struct half : public half_impl::half_base {
  }
};

} // end namespace Eigen

namespace std {
template<>
struct numeric_limits<Eigen::half> {
  static const bool is_specialized = true;
  static const bool is_signed = true;
  static const bool is_integer = false;
  static const bool is_exact = false;
  static const bool has_infinity = true;
  static const bool has_quiet_NaN = true;
  static const bool has_signaling_NaN = true;
  static const float_denorm_style has_denorm = denorm_present;
  static const bool has_denorm_loss = false;
  static const std::float_round_style round_style = std::round_to_nearest;
  static const bool is_iec559 = false;
  static const bool is_bounded = false;
  static const bool is_modulo = false;
  static const int digits = 11;
  static const int digits10 = 3;     // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
  static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
  static const int radix = 2;
  static const int min_exponent = -13;
  static const int min_exponent10 = -4;
  static const int max_exponent = 16;
  static const int max_exponent10 = 4;
  static const bool traps = true;
  static const bool tinyness_before = false;

  static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
  static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
  static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
  static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
  static Eigen::half round_error() { return Eigen::half(0.5); }
  static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
  static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
  static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
  static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
};

// If std::numeric_limits<T> is specialized, should also specialize
// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
// std::numeric_limits<const volatile T>
// https://stackoverflow.com/a/16519653/
template<>
struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
template<>
struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
template<>
struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
} // end namespace std
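// [Editor's sketch, not part of the commit] With this specialization, generic
// numeric code works on Eigen::half unchanged, e.g.:
//
//   #include <limits>
//   const Eigen::half eps = std::numeric_limits<Eigen::half>::epsilon();
//   // the raw value 0x0800 decodes to 2^-13 (~1.22e-4): sign 0, biased
//   // exponent 2 (i.e. 2 - 15), zero mantissa.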

namespace Eigen {

namespace half_impl {

#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530

// Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than fp32 arithmetic (you need to use the half2
// these are no faster than float32_bits arithmetic (you need to use the half2
// versions to get the ALU speed increased), but you do save the
// conversion steps back and forth.

__device__ half operator + (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
  return __hadd(a, b);
}
__device__ half operator * (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
  return __hmul(a, b);
}
__device__ half operator - (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
  return __hsub(a, b);
}
__device__ half operator / (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
  float num = __half2float(a);
  float denom = __half2float(b);
  return __float2half(num / denom);
}
__device__ half operator - (const half& a) {
EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
  return __hneg(a);
}
__device__ half& operator += (half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
  a = a + b;
  return a;
}
__device__ half& operator *= (half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
  a = a * b;
  return a;
}
__device__ half& operator -= (half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
  a = a - b;
  return a;
}
__device__ half& operator /= (half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
  a = a / b;
  return a;
}
__device__ bool operator == (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
  return __heq(a, b);
}
__device__ bool operator != (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
  return __hne(a, b);
}
__device__ bool operator < (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
  return __hlt(a, b);
}
__device__ bool operator <= (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
  return __hle(a, b);
}
__device__ bool operator > (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
  return __hgt(a, b);
}
__device__ bool operator >= (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
  return __hge(a, b);
}

#else // Emulate support for half floats

// Definitions for CPUs and older CUDA, mostly working through conversion
// to/from fp32.
// to/from float32_bits.

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
  return half(float(a) + float(b));

@@ -238,10 +300,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b)
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
  return float(a) == float(b);
  return numext::equal_strict(float(a),float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
  return float(a) != float(b);
  return numext::not_equal_strict(float(a), float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
  return float(a) < float(b);

@@ -269,34 +331,35 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
// these in hardware. If we need more performance on older/other CPUs, they are
// also possible to vectorize directly.

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
  __half h;
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
  __half_raw h;
  h.x = x;
  return h;
}

union FP32 {
union float32_bits {
  unsigned int u;
  float f;
};
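// [Editor's note] float32_bits reinterprets a float's bit pattern as an
// unsigned int (type punning through a union; memcpy is the strictly
// conforming spelling, but the union form is accepted by the compilers Eigen
// targets). For example:
//
//   float32_bits b; b.f = 1.0f;  // now b.u == 0x3f800000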

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
  return __float2half(ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
  __half tmp_ff = __float2half(ff);
  return *(__half_raw*)&tmp_ff;

#elif defined(EIGEN_HAS_FP16_C)
  __half h;
  __half_raw h;
  h.x = _cvtss_sh(ff, 0);
  return h;

#else
  FP32 f; f.f = ff;
  float32_bits f; f.f = ff;

  const FP32 f32infty = { 255 << 23 };
  const FP32 f16max = { (127 + 16) << 23 };
  const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
  const float32_bits f32infty = { 255 << 23 };
  const float32_bits f16max = { (127 + 16) << 23 };
  const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
  unsigned int sign_mask = 0x80000000u;
  __half o;
  __half_raw o;
  o.x = static_cast<unsigned short>(0x0u);

  unsigned int sign = f.u & sign_mask;
@@ -335,17 +398,17 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
#endif
}

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
  return __half2float(h);

#elif defined(EIGEN_HAS_FP16_C)
  return _cvtsh_ss(h.x);

#else
  const FP32 magic = { 113 << 23 };
  const float32_bits magic = { 113 << 23 };
  const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
  FP32 o;
  float32_bits o;

  o.u = (h.x & 0x7fff) << 13;           // exponent/mantissa bits
  unsigned int exp = shifted_exp & o.u; // just the exponent

@@ -370,7 +433,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
  return (a.x & 0x7fff) == 0x7c00;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return __hisnan(a);
#else
  return (a.x & 0x7fff) > 0x7c00;

@@ -386,11 +449,15 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
  return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
  return half(hexp(a));
#else
  return half(::expf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return Eigen::half(::hlog(a));
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return half(::hlog(a));
#else
  return half(::logf(float(a)));
#endif

@@ -402,7 +469,11 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
  return half(::log10f(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
  return half(hsqrt(a));
#else
  return half(::sqrtf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
  return half(::powf(float(a), float(b)));

@@ -420,14 +491,22 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
  return half(::tanhf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
  return half(hfloor(a));
#else
  return half(::floorf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
  return half(hceil(a));
#else
  return half(::ceilf(float(a)));
#endif
}

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return __hlt(b, a) ? b : a;
#else
  const float f1 = static_cast<float>(a);

@@ -436,7 +515,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return __hlt(a, b) ? b : a;
#else
  const float f1 = static_cast<float>(a);

@@ -477,6 +556,13 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
template<> struct NumTraits<Eigen::half>
  : GenericNumTraits<Eigen::half>
{
  enum {
    IsSigned = true,
    IsInteger = false,
    IsComplex = false,
    RequireInitialization = false
  };

  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
    return half_impl::raw_uint16_to_half(0x0800);
  }

@@ -507,7 +593,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
  return Eigen::half(::expf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return Eigen::half(::hlog(a));
#else
  return Eigen::half(::logf(float(a)));

@@ -541,14 +627,18 @@ struct hash<Eigen::half> {


// Add the missing shfl_xor intrinsic
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
#if EIGEN_CUDACC_VER < 90000
  return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
#else
  return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
#endif
}
#endif
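// [Editor's sketch, not part of the commit] A typical use of this wrapper is a
// warp-level butterfly reduction (device code, assuming a full 32-lane warp):
//
//   __device__ Eigen::half warp_sum(Eigen::half v) {
//     for (int m = 16; m > 0; m >>= 1)
//       v = v + __shfl_xor(v, m);  // after the loop every lane holds the sum
//     return v;
//   }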

// ldg() has an overload for __half, but we also need one for Eigen::half.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
  return Eigen::half_impl::raw_uint16_to_half(
      __ldg(reinterpret_cast<const unsigned short*>(ptr)));

@@ -556,7 +646,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
#endif


#if defined(__CUDA_ARCH__)
#if defined(EIGEN_CUDA_ARCH)
namespace Eigen {
namespace numext {

@@ -291,7 +291,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<float4,4>& kernel) {
  double tmp = kernel.packet[0].y;
  float tmp = kernel.packet[0].y;
  kernel.packet[0].y = kernel.packet[1].x;
  kernel.packet[1].x = tmp;


@@ -99,7 +99,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2&

template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
  half2 result;
  result.x = a.x & 0x7FFF7FFF;
  unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
  *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
  return result;
}
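// [Editor's note] Clearing bit 15 of each packed fp16 value clears its sign,
// so a single 32-bit AND with 0x7FFF7FFF computes |x| for both halves at once.
// The reinterpret_casts avoid touching half2's data members, which are no
// longer directly accessible as integers in newer CUDA versions.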
@@ -229,7 +230,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2&
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
  return Eigen::half(__float2half_rn(a1 + a2));
#endif
}

@@ -263,7 +264,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const ha
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
  return Eigen::half(__float2half_rn(a1 * a2));
#endif
}

@@ -275,7 +276,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
  return __floats2half2_rn(r1, r2);
}

#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530

template<> __device__ EIGEN_STRONG_INLINE
half2 plog<half2>(const half2& a) {

@@ -0,0 +1,29 @@

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_ARCH_CONJ_HELPER_H
#define EIGEN_ARCH_CONJ_HELPER_H

#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \
  template<> struct conj_helper<PACKET_REAL, PACKET_CPLX, false,false> { \
    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \
    { return padd(c, pmul(x,y)); } \
    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \
    { return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); } \
  }; \
  \
  template<> struct conj_helper<PACKET_CPLX, PACKET_REAL, false,false> { \
    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \
    { return padd(c, pmul(x,y)); } \
    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \
    { return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); } \
  };

#endif // EIGEN_ARCH_CONJ_HELPER_H
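// [Editor's note] Each SIMD backend instantiates this helper pair for its
// complex/real packet types, e.g. (as done elsewhere in this commit):
//
//   EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
//   EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
//
// replacing the hand-written conj_helper specializations deleted above.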

@@ -67,7 +67,7 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
  float32x2_t r64;
  r64 = vld1_f32((float *)&from);
  r64 = vld1_f32((const float *)&from);

  return Packet2cf(vcombine_f32(r64, r64));
}

@@ -142,7 +142,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
  to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
}

template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((const float *)addr); }

template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
@@ -265,6 +265,8 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
  }
};

EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)

template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  // TODO optimize it for NEON

@@ -275,7 +277,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
  s = vmulq_f32(b.v, b.v);
  rev_s = vrev64q_f32(s);

  return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
  return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s,rev_s)));
}

EIGEN_DEVICE_FUNC inline void

@@ -381,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }

template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((double *)addr); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((const double *)addr); }

template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
{

@@ -456,6 +458,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
  }
};

EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)

template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  // TODO optimize it for NEON

@@ -36,29 +36,63 @@ namespace internal {
#endif
#endif

#if EIGEN_COMP_MSVC

// In MSVC's arm_neon.h header file, all NEON vector types
// are aliases to the same underlying type __n128.
// We thus have to wrap them to make them different C++ types.
// (See also bug 1428)

template<typename T,int unique_id>
struct eigen_packet_wrapper
{
  operator T&() { return m_val; }
  operator const T&() const { return m_val; }
  eigen_packet_wrapper() {}
  eigen_packet_wrapper(const T &v) : m_val(v) {}
  eigen_packet_wrapper& operator=(const T &v) {
    m_val = v;
    return *this;
  }

  T m_val;
};
typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
typedef eigen_packet_wrapper<int32x4_t ,2> Packet4i;
typedef eigen_packet_wrapper<int32x2_t ,3> Packet2i;
typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;

#else

typedef float32x2_t Packet2f;
typedef float32x4_t Packet4f;
typedef int32x4_t Packet4i;
typedef int32x2_t Packet2i;
typedef uint32x4_t Packet4ui;

#endif // EIGEN_COMP_MSVC
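// [Editor's note] Without the wrapper, Packet4f and Packet4i would be the same
// C++ type under MSVC (both aliases of __n128), so two explicit
// specializations such as
//
//   template<> Packet4f padd<Packet4f>(const Packet4f&, const Packet4f&);
//   template<> Packet4i padd<Packet4i>(const Packet4i&, const Packet4i&);
//
// would name one and the same specialization and fail to compile. The distinct
// unique_id makes each typedef a distinct type, while the conversion operators
// let the NEON intrinsics keep accepting the wrapped values unchanged.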

#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))

#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)

// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function,
// which is available on LLVM and GCC (at least)
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#if EIGEN_ARCH_ARM64
// __builtin_prefetch tends to do nothing on ARM64 compilers because the
// prefetch instructions there are too detailed for __builtin_prefetch to map
// meaningfully to them.
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#elif defined __pld
#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
#elif !EIGEN_ARCH_ARM64
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
#elif EIGEN_ARCH_ARM32
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
#else
// by default no explicit prefetching
#define EIGEN_ARM_PREFETCH(ADDR)
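// [Editor's note] Whichever branch is selected, the macro is used identically
// at the call sites, e.g. (as later in this file):
//
//   template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)
//   { EIGEN_ARM_PREFETCH(addr); }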

@@ -83,7 +117,7 @@ template<> struct packet_traits<float> : default_packet_traits
    HasSqrt = 0
  };
};
template<> struct packet_traits<int> : default_packet_traits
template<> struct packet_traits<int32_t> : default_packet_traits
{
  typedef Packet4i type;
  typedef Packet4i half; // Packet2i intrinsics not implemented yet

@@ -106,18 +140,18 @@ EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_
#endif

template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };

template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }

template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
{
  const float32_t f[] = {0, 1, 2, 3};
  const float f[] = {0, 1, 2, 3};
  Packet4f countdown = vld1q_f32(f);
  return vaddq_f32(pset1<Packet4f>(a), countdown);
}
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
{
  const int32_t i[] = {0, 1, 2, 3};
  Packet4i countdown = vld1q_s32(i);

@@ -241,10 +275,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }

template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }

template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{

@@ -253,7 +287,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
  hi = vld1_dup_f32(from+1);
  return vcombine_f32(lo, hi);
}
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
{
  int32x2_t lo, hi;
  lo = vld1_dup_s32(from);

@@ -262,10 +296,10 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
}

template<> EIGEN_STRONG_INLINE void pstore<float> (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }

template<> EIGEN_STRONG_INLINE void pstoreu<float> (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }

template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{

@@ -276,7 +310,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
  res = vsetq_lane_f32(from[3*stride], res, 3);
  return res;
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
{
  Packet4i res = pset1<Packet4i>(0);
  res = vsetq_lane_s32(from[0*stride], res, 0);

@@ -293,7 +327,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co
  to[stride*2] = vgetq_lane_f32(from, 2);
  to[stride*3] = vgetq_lane_f32(from, 3);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
{
  to[stride*0] = vgetq_lane_s32(from, 0);
  to[stride*1] = vgetq_lane_s32(from, 1);

@@ -302,11 +336,11 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
}

template<> EIGEN_STRONG_INLINE void prefetch<float> (const float* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }

// FIXME only store the 2 first elements ?
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }

template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
  float32x2_t a_lo, a_hi;

@@ -361,7 +395,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
  return sum;
}

template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
{
  int32x2_t a_lo, a_hi, sum;


@@ -408,7 +442,7 @@ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)

  return vget_lane_f32(prod, 0);
}
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
{
  int32x2_t a_lo, a_hi, prod;


@@ -436,7 +470,7 @@ template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
  return vget_lane_f32(min, 0);
}

template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
{
  int32x2_t a_lo, a_hi, min;


@@ -461,7 +495,7 @@ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
  return vget_lane_f32(max, 0);
}

template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
{
  int32x2_t a_lo, a_hi, max;

@@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
                       _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
}

template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }

template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{

@@ -229,23 +229,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
  }
};

template<> struct conj_helper<Packet4f, Packet2cf, false,false>
{
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
};

template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)

template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{

@@ -340,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }

template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }

template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
@@ -430,23 +414,7 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
  }
};

template<> struct conj_helper<Packet2d, Packet1cd, false,false>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
};

template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)

template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{

@@ -28,7 +28,7 @@ namespace internal {
#endif
#endif

#if (defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)
#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
// have overloads for both types without linking error.
// One solution is to increase ABI version using -fabi-version=4 (or greater).

@@ -409,10 +409,16 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double&
pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
}

#if EIGEN_COMP_PGI
typedef const void * SsePrefetchPtrType;
#else
typedef const char * SsePrefetchPtrType;
#endif

#ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif

#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64

@@ -876,4 +882,14 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co

} // end namespace Eigen

#if EIGEN_COMP_PGI
// PGI++ does not define the following intrinsics in C++ mode.
static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); }
static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); }
static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
#endif

#endif // EIGEN_PACKET_MATH_SSE_H
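The SsePrefetchPtrType typedef above exists because _mm_prefetch takes a const char* in the GCC/Clang/MSVC headers but a const void* under PGI. A minimal sketch of the same portability shim outside Eigen (PortablePrefetchPtr and prefetch_ro are our own names, not Eigen's):

#include <xmmintrin.h>

#if defined(__PGI)
typedef const void* PortablePrefetchPtr;  // PGI declares _mm_prefetch(const void*, int)
#else
typedef const char* PortablePrefetchPtr;  // GCC/Clang/MSVC declare _mm_prefetch(const char*, int)
#endif

static inline void prefetch_ro(const float* p) {
  // The cast only changes the pointer type seen by the intrinsic; no data is touched.
  _mm_prefetch((PortablePrefetchPtr)(p), _MM_HINT_T0);
}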
@@ -14,6 +14,7 @@ namespace Eigen {

namespace internal {

#ifndef EIGEN_VECTORIZE_AVX
template <>
struct type_casting_traits<float, int> {
enum {

@@ -23,11 +24,6 @@ struct type_casting_traits<float, int> {
};
};

template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
return _mm_cvttps_epi32(a);
}

template <>
struct type_casting_traits<int, float> {
enum {

@@ -37,11 +33,6 @@ struct type_casting_traits<int, float> {
};
};

template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return _mm_cvtepi32_ps(a);
}

template <>
struct type_casting_traits<double, float> {
enum {

@@ -51,10 +42,6 @@ struct type_casting_traits<double, float> {
};
};

template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
}

template <>
struct type_casting_traits<float, double> {
enum {

@@ -63,6 +50,19 @@ struct type_casting_traits<float, double> {
TgtCoeffRatio = 2
};
};
#endif

template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
return _mm_cvttps_epi32(a);
}

template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return _mm_cvtepi32_ps(a);
}

template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
}

template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
// Simply discard the second half of the input
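For reference, the float-to-int packet cast moved out of the guard above uses _mm_cvttps_epi32, i.e. truncation toward zero, matching a C-style scalar cast. A small stand-alone check (our own test harness, not part of Eigen):

#include <emmintrin.h>
#include <cstdio>

int main() {
  __m128 f = _mm_setr_ps(1.9f, -1.9f, 2.5f, -2.5f);
  __m128i i = _mm_cvttps_epi32(f);  // truncates toward zero: 1, -1, 2, -2
  int out[4];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), i);
  std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  return 0;
}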
@@ -336,6 +336,9 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};

EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)

template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
// TODO optimize it for AltiVec
@@ -100,7 +100,7 @@ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
// Mask alignment
#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0

#define _EIGEN_ALIGNED_PTR(x) ((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)

// Handle endianness properly while loading constants
// Define global static constants:
@@ -28,7 +28,7 @@ template<typename DstScalar,typename SrcScalar> struct assign_op {
{ internal::pstoret<DstScalar,Packet,Alignment>(a,b); }
};

// Empty overload for void type (used by PermutationMatrix
// Empty overload for void type (used by PermutationMatrix)
template<typename DstScalar> struct assign_op<DstScalar,void> {};

template<typename DstScalar,typename SrcScalar>
@@ -255,7 +255,7 @@ struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,Rh

/** \internal
* \brief Template functor to compute the hypot of two scalars
* \brief Template functor to compute the hypot of two \b positive \b and \b real scalars
*
* \sa MatrixBase::stableNorm(), class Redux
*/

@@ -263,22 +263,15 @@ template<typename Scalar>
struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
{
EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
// typedef typename NumTraits<Scalar>::Real result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const
{
EIGEN_USING_STD_MATH(sqrt)
Scalar p, qp;
if(_x>_y)
{
p = _x;
qp = _y / p;
}
else
{
p = _y;
qp = _x / p;
}
return p * sqrt(Scalar(1) + qp*qp);
// This functor is used by hypotNorm only for which it is faster to first apply abs
// on all coefficients prior to reduction through hypot.
// This way we avoid calling abs on positive and real entries, and this also permits
// to seamlessly handle complexes. Otherwise we would have to handle both real and complexes
// through the same functor...
return internal::positive_real_hypot(x,y);
}
};
template<typename Scalar>
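Both the old branchy body above and the new positive_real_hypot implement the classic overflow-safe hypot: factor out the larger magnitude so the intermediate square cannot overflow. A minimal scalar sketch of that idea (the function name and the explicit zero guard are ours):

#include <algorithm>
#include <cmath>

// sqrt(x*x + y*y) for x, y >= 0 without overflow in the intermediate squares.
double positive_hypot(double x, double y) {
  double p = std::max(x, y);
  if (p == 0.0) return 0.0;         // avoid 0/0 when both inputs are zero
  double qp = std::min(x, y) / p;   // qp <= 1, so qp*qp cannot overflow
  return p * std::sqrt(1.0 + qp * qp);
}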
@@ -44,16 +44,16 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
{
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
m_interPacket(plset<Packet>(0)),
m_flip(numext::abs(high)<numext::abs(low))
{}

template<typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
typedef typename NumTraits<Scalar>::Real RealScalar;
if(m_flip)
return (i==0)? m_low : (m_high - (m_size1-i)*m_step);
return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step);
else
return (i==m_size1)? m_high : (m_low + i*m_step);
return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step);
}

template<typename IndexType>

@@ -63,7 +63,7 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
// [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
if(m_flip)
{
Packet pi = padd(pset1<Packet>(Scalar(i-m_size1)),m_interPacket);
Packet pi = plset<Packet>(Scalar(i-m_size1));
Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
if(i==0)
res = pinsertfirst(res, m_low);

@@ -71,7 +71,7 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
}
else
{
Packet pi = padd(pset1<Packet>(Scalar(i)),m_interPacket);
Packet pi = plset<Packet>(Scalar(i));
Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
if(i==m_size1-unpacket_traits<Packet>::size+1)
res = pinsertlast(res, m_high);

@@ -83,7 +83,6 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
const Scalar m_high;
const Index m_size1;
const Scalar m_step;
const Packet m_interPacket;
const bool m_flip;
};

@@ -93,8 +92,8 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/true>
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
m_low(low),
m_multiplier((high-low)/convert_index<Scalar>(num_steps<=1 ? 1 : num_steps-1)),
m_divisor(convert_index<Scalar>(num_steps+high-low)/(high-low+1)),
m_use_divisor((high+1)<(low+num_steps))
m_divisor(convert_index<Scalar>((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))),
m_use_divisor(num_steps>1 && (numext::abs(high-low)+1)<num_steps)
{}

template<typename IndexType>
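The non-integer linspaced functor above computes low + i*step, but returns the exact endpoints at i==0 and i==m_size1, and walks down from high when |high| < |low| so the better-conditioned end stays exact. A scalar sketch of that evaluation rule, assuming num_steps > 1 (so size1 = num_steps-1) and using our own function name:

#include <cmath>

double linspaced_coeff(double low, double high, long size1, long i) {
  double step = (high - low) / double(size1);
  bool flip = std::fabs(high) < std::fabs(low);
  if (flip)
    return (i == 0) ? low : high - double(size1 - i) * step;  // walk down from high
  return (i == size1) ? high : low + double(i) * step;        // walk up from low
}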
@@ -83,13 +83,17 @@ struct functor_traits<std::binder1st<T> >
{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
#endif

#if (__cplusplus < 201703L) && (EIGEN_COMP_MSVC < 1910)
// std::unary_negate is deprecated since c++17 and will be removed in c++20
template<typename T>
struct functor_traits<std::unary_negate<T> >
{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };

// std::binary_negate is deprecated since c++17 and will be removed in c++20
template<typename T>
struct functor_traits<std::binary_negate<T> >
{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
#endif

#ifdef EIGEN_STDEXT_SUPPORT
@@ -768,7 +768,7 @@ struct scalar_sign_op<Scalar,true> {
if (aa==real_type(0))
return Scalar(0);
aa = real_type(1)/aa;
return Scalar(real(a)*aa, imag(a)*aa );
return Scalar(a.real()*aa, a.imag()*aa );
}
//TODO
//template <typename Packet>
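The scalar_sign_op branch above computes sign(a) = a/|a| for complex a, with the zero case handled explicitly; the change only switches from unqualified real()/imag() calls to the std::complex members. A stand-alone sketch of the same computation (csign is our own name):

#include <complex>

std::complex<double> csign(const std::complex<double>& a) {
  double aa = std::abs(a);
  if (aa == 0.0) return std::complex<double>(0.0, 0.0);
  aa = 1.0 / aa;
  return std::complex<double>(a.real() * aa, a.imag() * aa);  // a / |a|
}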
@@ -115,7 +115,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
// registers. However once the latency is hidden there is no point in
// increasing the value of k, so we'll cap it at 320 (value determined
// experimentally).
const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
// To avoid that k vanishes, we make k_cache at least as big as kr
const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
if (k_cache < k) {
k = k_cache - (k_cache % kr);
eigen_internal_assert(k > 0);

@@ -648,8 +649,8 @@ public:
// Vectorized path
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
{
dest.first = pset1<RealPacket>(real(*b));
dest.second = pset1<RealPacket>(imag(*b));
dest.first = pset1<RealPacket>(numext::real(*b));
dest.second = pset1<RealPacket>(numext::imag(*b));
}

EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const

@@ -1197,10 +1198,16 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
RhsPacket B_0, B1, B2, B3, T0;

// NOTE: the begin/end asm comments below work around bug 935!
// but they are not enough for gcc>=6 without FMA (bug 1637)
#if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
#else
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
#endif
#define EIGEN_GEBGP_ONESTEP(K) \
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \

@@ -1212,6 +1219,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
traits.madd(A1, B2, C6, B2); \
traits.madd(A0, B3, C3, T0); \
traits.madd(A1, B3, C7, B3); \
EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
} while(false)

@@ -1526,10 +1534,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
// The following piece of code won't work for 512 bit registers
// Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
// as nr (which is currently 4) for the return type.
typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
if ((SwappedTraits::LhsProgress % 4) == 0 &&
(SwappedTraits::LhsProgress <= 8) &&
(SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr))
(SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
{
SAccPacket C0, C1, C2, C3;
straits.initAcc(C0);
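The EIGEN_GEBP_2PX4_SPILLING_WORKAROUND macro above is an empty inline-asm statement whose only effect comes from its operand constraints: declaring A0/A1 as read-write operands that may live in an SSE register or memory acts as an optimization barrier and blocks the rescheduling that caused the extra register spills of bug 1637. The same trick in isolation (illustrative sketch, our own function name; GCC/Clang only):

// Compiled with g++ -O3 -msse2: the empty asm emits no instruction itself,
// but forces the compiler to treat x as read and written at this point.
inline void pin_to_reg_or_mem(float& x) {
#if defined(__GNUC__)
  __asm__("" : "+x,m"(x));  // "+x,m": read-write, in an SSE register or in memory
#endif
}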
@@ -20,8 +20,9 @@ template<typename _LhsScalar, typename _RhsScalar> class level3_blocking;
template<
typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride>
{
typedef gebp_traits<RhsScalar,LhsScalar> Traits;

@@ -30,7 +31,7 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
Index rows, Index cols, Index depth,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsStride,
ResScalar* res, Index resStride,
ResScalar* res, Index resIncr, Index resStride,
ResScalar alpha,
level3_blocking<RhsScalar,LhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0)

@@ -39,8 +40,8 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
general_matrix_matrix_product<Index,
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
ColMajor>
::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info);
ColMajor,ResInnerStride>
::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking,info);
}
};

@@ -49,8 +50,9 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
template<
typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride>
{

typedef gebp_traits<LhsScalar,RhsScalar> Traits;

@@ -59,17 +61,17 @@ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScala
static void run(Index rows, Index cols, Index depth,
const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride,
ResScalar* _res, Index resStride,
ResScalar* _res, Index resIncr, Index resStride,
ResScalar alpha,
level3_blocking<LhsScalar,RhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0)
{
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor,Unaligned,ResInnerStride> ResMapper;
LhsMapper lhs(_lhs, lhsStride);
RhsMapper rhs(_rhs, rhsStride);
ResMapper res(_res, resStride);
ResMapper res(_res, resStride, resIncr);

Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction

@@ -83,8 +85,8 @@ static void run(Index rows, Index cols, Index depth,
if(info)
{
// this is the parallel version!
Index tid = omp_get_thread_num();
Index threads = omp_get_num_threads();
int tid = omp_get_thread_num();
int threads = omp_get_num_threads();

LhsScalar* blockA = blocking.blockA();
eigen_internal_assert(blockA!=0);

@@ -116,9 +118,9 @@ static void run(Index rows, Index cols, Index depth,
info[tid].sync = k;

// Computes C_i += A' * B' per A'_i
for(Index shift=0; shift<threads; ++shift)
for(int shift=0; shift<threads; ++shift)
{
Index i = (tid+shift)%threads;
int i = (tid+shift)%threads;

// At this point we have to make sure that A'_i has been updated by the thread i,
// we use testAndSetOrdered to mimic a volatile access.

@@ -226,7 +228,7 @@ struct gemm_functor
Gemm::run(rows, cols, m_lhs.cols(),
&m_lhs.coeffRef(row,0), m_lhs.outerStride(),
&m_rhs.coeffRef(0,col), m_rhs.outerStride(),
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.innerStride(), m_dest.outerStride(),
m_actualAlpha, m_blocking, info);
}

@@ -428,7 +430,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
lazyproduct::evalTo(dst, lhs, rhs);
lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
else
{
dst.setZero();

@@ -440,7 +442,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
lazyproduct::addTo(dst, lhs, rhs);
lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
else
scaleAndAddTo(dst,lhs, rhs, Scalar(1));
}

@@ -449,7 +451,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
lazyproduct::subTo(dst, lhs, rhs);
lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
else
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
}

@@ -476,7 +478,8 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
Index,
LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,
Dest::InnerStrideAtCompileTime>,
ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;

BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
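The thread running through all of the GEMM hunks above is the new ResInnerStride template parameter: the destination is no longer assumed to be a dense column-major block, so every kernel now receives resIncr (the inner stride) and builds its result mapper from both strides. A sketch of what such a two-stride, column-major mapper resolves to (StridedMapper is our own name, not Eigen's blas_data_mapper):

template <typename Scalar, typename Index>
struct StridedMapper {
  Scalar* data;
  Index outer;  // distance between consecutive columns
  Index inner;  // distance between consecutive rows (1 for a dense block)
  StridedMapper(Scalar* d, Index outerStride, Index innerStride = 1)
    : data(d), outer(outerStride), inner(innerStride) {}
  Scalar& operator()(Index i, Index j) const { return data[i * inner + j * outer]; }
};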
@@ -25,51 +25,54 @@ namespace internal {
**********************************************************************/

// forward declarations (defined at the end of this file)
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
struct tribb_kernel;

/* Optimized matrix-matrix product evaluating only one triangular half */
template <typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResStorageOrder, int UpLo, int Version = Specialized>
int ResStorageOrder, int ResInnerStride, int UpLo, int Version = Specialized>
struct general_matrix_matrix_triangular_product;

// as usual if the result is row major => we transpose the product
template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version>
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride, int UpLo, int Version>
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,UpLo,Version>
{
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride,
const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr, Index resStride,
const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
{
general_matrix_matrix_triangular_product<Index,
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
ColMajor, UpLo==Lower?Upper:Lower>
::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking);
ColMajor, ResInnerStride, UpLo==Lower?Upper:Lower>
::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking);
}
};

template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version>
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride, int UpLo, int Version>
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,UpLo,Version>
{
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride,
const RhsScalar* _rhs, Index rhsStride,
ResScalar* _res, Index resIncr, Index resStride,
const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
{
typedef gebp_traits<LhsScalar,RhsScalar> Traits;

typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
ResMapper res(_res, resStride, resIncr);

Index kc = blocking.kc();
Index mc = (std::min)(size,blocking.mc());

@@ -87,7 +90,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, ResInnerStride, UpLo> sybb;

for(Index k2=0; k2<depth; k2+=kc)
{

@@ -110,8 +113,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
(std::min)(size,i2), alpha, -1, -1, 0, 0);

sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
sybb(_res+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);

if (UpLo==Upper)
{

@@ -133,7 +135,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
// while the triangular block overlapping the diagonal is evaluated into a
// small temporary buffer which is then accumulated into the result using a
// triangular traversal.
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
struct tribb_kernel
{
typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;

@@ -142,13 +144,15 @@ struct tribb_kernel
enum {
BlockSize = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret
};
void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
{
typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
ResMapper res(_res, resStride);
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned> BufferMapper;
ResMapper res(_res, resStride, resIncr);
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel1;
gebp_kernel<LhsScalar, RhsScalar, Index, BufferMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel2;

Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert()));

// let's process the block per panel of actual_mc x BlockSize,
// again, each is split into three parts, etc.

@@ -158,7 +162,7 @@ struct tribb_kernel
const RhsScalar* actual_b = blockB+j*depth;

if(UpLo==Upper)
gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
gebp_kernel1(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
-1, -1, 0, 0);

// selfadjoint micro block

@@ -166,22 +170,23 @@ struct tribb_kernel
Index i = j;
buffer.setZero();
// 1 - apply the kernel on the temporary buffer
gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
gebp_kernel2(BufferMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0);

// 2 - triangular accumulation
for(Index j1=0; j1<actualBlockSize; ++j1)
{
ResScalar* r = &res(i, j + j1);
typename ResMapper::LinearMapper r = res.getLinearMapper(i,j+j1);
for(Index i1=UpLo==Lower ? j1 : 0;
UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
r[i1] += buffer(i1,j1);
r(i1) += buffer(i1,j1);
}
}

if(UpLo==Lower)
{
Index i = j+actualBlockSize;
gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
gebp_kernel1(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
depth, actualBlockSize, alpha, -1, -1, 0, 0);
}
}

@@ -269,10 +274,13 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
enum {
IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0,
RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0
RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0,
SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0
};

Index size = mat.cols();
if(SkipDiag)
size--;
Index depth = actualLhs.cols();

typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar,

@@ -283,10 +291,12 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
internal::general_matrix_matrix_triangular_product<Index,
typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
IsRowMajor ? RowMajor : ColMajor, UpLo>
IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo&(Lower|Upper)>
::run(size, depth,
&actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(),
mat.data(), mat.outerStride(), actualAlpha, blocking);
&actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(),
&actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(),
mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? mat.innerStride() : mat.outerStride() ) : 0),
mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
}
};

@@ -294,6 +304,7 @@ template<typename MatrixType, unsigned int UpLo>
template<typename ProductType>
TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
{
EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());

general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
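tribb_kernel above keeps the off-diagonal panels on the fast gebp path and only routes the blocks overlapping the diagonal through a small dense buffer, copying back just the wanted triangle. The accumulation step, reduced to a plain loop (Buf and Res stand in for the mapper types; the function name is ours):

template <typename Buf, typename Res>
void triangular_accumulate(const Buf& buffer, Res& res, int bs, bool lower) {
  // Only the requested triangle of the bs-by-bs block is written back to res.
  for (int j1 = 0; j1 < bs; ++j1)
    for (int i1 = lower ? j1 : 0; lower ? (i1 < bs) : (i1 <= j1); ++i1)
      res(i1, j1) += buffer(i1, j1);
}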
@@ -40,7 +40,7 @@ namespace internal {
template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
struct general_matrix_matrix_rankupdate :
general_matrix_matrix_triangular_product<
Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {};
Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {};

// try to go to BLAS specialization

@@ -48,19 +48,19 @@ struct general_matrix_matrix_rankupdate :
template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs, int UpLo> \
struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,1,UpLo,Specialized> { \
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
const Scalar* rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
{ \
if (lhs==rhs) { \
if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \
general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
} else { \
general_matrix_matrix_triangular_product<Index, \
Scalar, LhsStorageOrder, ConjugateLhs, \
Scalar, RhsStorageOrder, ConjugateRhs, \
ColMajor, UpLo, BuiltIn> \
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
ColMajor, 1, UpLo, BuiltIn> \
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resIncr,resStride,alpha,blocking); \
} \
} \
};

@@ -88,7 +88,7 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
EIGTYPE beta(1); \
BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \
BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \
} \
};

@@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
} \
};

#ifdef EIGEN_USE_MKL
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk)
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk)
#else
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_)
#endif

// TODO handle complex cases
// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
@@ -46,25 +46,27 @@ namespace internal {

// gemm specialization

#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \
#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
template< \
typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1> \
{ \
typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
\
static void run(Index rows, Index cols, Index depth, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE* res, Index resIncr, Index resStride, \
EIGTYPE alpha, \
level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
GemmParallelInfo<Index>* /*info = 0*/) \
{ \
using std::conj; \
\
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
char transa, transb; \
BlasIndex m, n, k, lda, ldb, ldc; \
const EIGTYPE *a, *b; \

@@ -100,13 +102,20 @@ static void run(Index rows, Index cols, Index depth, \
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _rhs; \
\
BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
}};

GEMM_SPECIALIZATION(double, d, double, d)
GEMM_SPECIALIZATION(float, f, float, s)
GEMM_SPECIALIZATION(dcomplex, cd, double, z)
GEMM_SPECIALIZATION(scomplex, cf, float, c)
#ifdef EIGEN_USE_MKL
GEMM_SPECIALIZATION(double, d, double, dgemm)
GEMM_SPECIALIZATION(float, f, float, sgemm)
GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
#else
GEMM_SPECIALIZATION(double, d, double, dgemm_)
GEMM_SPECIALIZATION(float, f, float, sgemm_)
GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
#endif

} // end namespace internal
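GEMM_SPECIALIZATION now takes the full BLAS symbol (dgemm_, or MKL's dgemm) instead of a one-letter prefix, and routes alpha/beta through a BLASTYPE cast so the MKL_Complex16/MKL_Complex8 types also fit. For orientation, the reference-BLAS entry point the non-MKL branch binds to has the classic Fortran-style by-pointer signature (declaration sketch; const qualifiers are our addition and do not affect the ABI):

extern "C" void dgemm_(const char* transa, const char* transb,
                       const int* m, const int* n, const int* k,
                       const double* alpha, const double* a, const int* lda,
                       const double* b, const int* ldb,
                       const double* beta, double* c, const int* ldc);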
@@ -183,8 +183,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
alignmentPattern = AllAligned;
}

const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;

Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)

@@ -457,8 +457,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,R
alignmentPattern = AllAligned;
}

const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;

Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
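The offset1/offset3 change above is a genuine bug fix: FirstAligned is an enum constant, so the old condition (FirstAligned && alignmentStep==1) tested the constant's (non-zero) value rather than the alignment actually computed, and always took the same branch. The new code compares the runtime value against the enumerator. A reduced illustration of the pitfall (enum values are illustrative, not Eigen's):

enum Alignment { NoneAligned = 0, FirstAligned = 1, AllAligned = 2 };

int offset_for(Alignment alignmentPattern, int alignmentStep) {
  // Bug: 'FirstAligned && alignmentStep==1' ignores alignmentPattern entirely,
  // because FirstAligned is a non-zero constant.
  // Fix: compare the computed pattern against the enumerator.
  return (alignmentPattern == FirstAligned && alignmentStep == 1) ? 3 : 1;
}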
@@ -85,7 +85,7 @@ EIGEN_BLAS_GEMV_SPECIALIZE(float)
EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)

#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \
#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
{ \

@@ -113,14 +113,21 @@ static void run( \
x_ptr=x_tmp.data(); \
incx=1; \
} else x_ptr=rhs; \
BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
}\
};

EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d)
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s)
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z)
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv)
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv)
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv)
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv)
#else
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_)
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_)
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_)
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_)
#endif

} // end namespace internal
@@ -17,7 +17,8 @@ namespace internal {
/** \internal */
inline void manage_multi_threading(Action action, int* v)
{
static EIGEN_UNUSED int m_maxThreads = -1;
static int m_maxThreads = -1;
EIGEN_UNUSED_VARIABLE(m_maxThreads);

if(action==SetAction)
{

@@ -75,7 +76,7 @@ template<typename Index> struct GemmParallelInfo
{
GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}

int volatile sync;
Index volatile sync;
int volatile users;

Index lhs_start;

@@ -104,13 +105,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
// - the sizes are large enough

// compute the maximal number of threads from the size of the product:
// FIXME this has to be fine tuned
// This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once.
Index size = transpose ? rows : cols;
Index pb_max_threads = std::max<Index>(1,size / 32);
Index pb_max_threads = std::max<Index>(1,size / Functor::Traits::nr);

// compute the maximal number of threads from the total amount of work:
double work = static_cast<double>(rows) * static_cast<double>(cols) *
static_cast<double>(depth);
double kMinTaskSize = 50000; // Heuristic.
double kMinTaskSize = 50000; // FIXME improve this heuristic.
pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize));

// compute the number of threads we are going to use

@@ -149,8 +151,10 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
info[i].lhs_start = r0;
info[i].lhs_length = actualBlockRows;

if(transpose) func(c0, actualBlockCols, 0, rows, info);
else func(0, rows, c0, actualBlockCols, info);
if(transpose)
func(c0, actualBlockCols, 0, rows, info);
else
func(0, rows, c0, actualBlockCols, info);
}
#endif
}
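parallelize_gemm above now caps the thread count two ways: at most size/nr threads, so each thread keeps a full register-blocked panel of nr columns, and at most work/kMinTaskSize threads, so a task never drops below roughly 50k scalar operations. A sketch of the combined heuristic (the function name is ours; nr stands in for Functor::Traits::nr):

#include <algorithm>

long gemm_max_threads(long rows, long cols, long depth, long nr, bool transpose) {
  long size = transpose ? rows : cols;
  long pb_max_threads = std::max<long>(1, size / nr);  // one nr-wide panel per thread
  double work = double(rows) * double(cols) * double(depth);
  const double kMinTaskSize = 50000;  // heuristic from the code above
  pb_max_threads = std::max<long>(1, std::min<long>(pb_max_threads, long(work / kMinTaskSize)));
  return pb_max_threads;
}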
@@ -277,20 +277,21 @@ struct symm_pack_rhs
template <typename Scalar, typename Index,
int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
int ResStorageOrder>
int ResStorageOrder, int ResInnerStride>
struct product_selfadjoint_matrix;

template <typename Scalar, typename Index,
int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor>
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
int ResInnerStride>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor,ResInnerStride>
{

static EIGEN_STRONG_INLINE void run(
Index rows, Index cols,
const Scalar* lhs, Index lhsStride,
const Scalar* rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
product_selfadjoint_matrix<Scalar, Index,

@@ -298,33 +299,35 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs),
EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
ColMajor>
::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
ColMajor,ResInnerStride>
::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking);
}
};

template <typename Scalar, typename Index,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>
{

static EIGEN_DONT_INLINE void run(
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};

template <typename Scalar, typename Index,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs>
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>::run(
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride>
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>::run(
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* _res, Index resStride,
Scalar* _res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
Index size = rows;

@@ -334,11 +337,11 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
ResMapper res(_res, resStride, resIncr);

Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction

@@ -398,26 +401,28 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// matrix * selfadjoint product
template <typename Scalar, typename Index,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>
{

static EIGEN_DONT_INLINE void run(
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};

template <typename Scalar, typename Index,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs>
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>::run(
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride>
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>::run(
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* _res, Index resStride,
Scalar* _res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
Index size = cols;

@@ -425,9 +430,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
typedef gebp_traits<Scalar,Scalar> Traits;

typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
ResMapper res(_res,resStride);
ResMapper res(_res,resStride, resIncr);

Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction

@@ -503,12 +508,13 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor,
Dest::InnerStrideAtCompileTime>
::run(
lhs.rows(), rhs.cols(), // sizes
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
&dst.coeffRef(0,0), dst.outerStride(), // result info
&dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info
actualAlpha, blocking // alpha
);
}
@ -40,20 +40,22 @@ namespace internal {
|
|||
|
||||
/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
|
||||
|
||||
#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
||||
#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
|
||||
template <typename Index, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs> \
|
||||
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
|
||||
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
|
||||
{\
|
||||
\
|
||||
static void run( \
|
||||
Index rows, Index cols, \
|
||||
const EIGTYPE* _lhs, Index lhsStride, \
|
||||
const EIGTYPE* _rhs, Index rhsStride, \
|
||||
EIGTYPE* res, Index resStride, \
|
||||
EIGTYPE* res, Index resIncr, Index resStride, \
|
||||
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
|
||||
{ \
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
|
||||
eigen_assert(resIncr == 1); \
|
||||
char side='L', uplo='L'; \
|
||||
BlasIndex m, n, lda, ldb, ldc; \
|
||||
const EIGTYPE *a, *b; \
|
||||
|
@ -81,25 +83,27 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
|
|||
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
||||
} else b = _rhs; \
|
||||
\
|
||||
BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||
\
|
||||
} \
|
||||
};
|
||||
|
||||
|
||||
#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
||||
#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
|
||||
template <typename Index, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs> \
|
||||
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
|
||||
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
|
||||
{\
|
||||
static void run( \
|
||||
Index rows, Index cols, \
|
||||
const EIGTYPE* _lhs, Index lhsStride, \
|
||||
const EIGTYPE* _rhs, Index rhsStride, \
|
||||
EIGTYPE* res, Index resStride, \
|
||||
EIGTYPE* res, Index resIncr, Index resStride, \
|
||||
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
|
||||
{ \
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
|
||||
eigen_assert(resIncr == 1); \
|
||||
char side='L', uplo='L'; \
|
||||
BlasIndex m, n, lda, ldb, ldc; \
|
||||
const EIGTYPE *a, *b; \
|
||||
|
@ -144,33 +148,41 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
|
|||
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
||||
} \
|
||||
\
|
||||
BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||
\
|
||||
} \
|
||||
};
|
||||
|
||||
EIGEN_BLAS_SYMM_L(double, double, d, d)
EIGEN_BLAS_SYMM_L(float, float, f, s)
EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z)
EIGEN_BLAS_HEMM_L(scomplex, float, cf, c)

#ifdef EIGEN_USE_MKL
EIGEN_BLAS_SYMM_L(double, double, d, dsymm)
EIGEN_BLAS_SYMM_L(float, float, f, ssymm)
EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm)
EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm)
#else
EIGEN_BLAS_SYMM_L(double, double, d, dsymm_)
EIGEN_BLAS_SYMM_L(float, float, f, ssymm_)
EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_)
EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_)
#endif
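Reviewer note on the BLASPREFIX -> BLASFUNC change above: the old macros token-pasted a one-letter prefix into a Fortran-style symbol (d##symm_ -> dsymm_), which cannot express MKL's un-suffixed names or its MKL_Complex8/MKL_Complex16 argument types. Passing the complete callable name lets one macro body serve both builds. A minimal standalone sketch of the same dispatch technique (all names below are hypothetical, not Eigen's):

// Hypothetical backends standing in for dsymm/dsymm_:
#include <cstdio>
void my_dsymm (double a) { std::printf("vendor  %g\n", a); }
void my_dsymm_(double a) { std::printf("fortran %g\n", a); }
// The macro now receives the full callable name instead of a prefix to paste:
#define DEFINE_SYMM_WRAPPER(SCALAR, BLASFUNC) \
  inline void symm_wrapper(SCALAR a) { BLASFUNC(a); }
#ifdef USE_VENDOR_NAMES
DEFINE_SYMM_WRAPPER(double, my_dsymm)
#else
DEFINE_SYMM_WRAPPER(double, my_dsymm_)
#endif
int main() { symm_wrapper(2.0); return 0; }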
/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */

#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \
{\
\
static void run( \
Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE* res, Index resIncr, Index resStride, \
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
char side='R', uplo='L'; \
BlasIndex m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
@@ -197,25 +209,27 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _lhs; \
\
BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};


#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \
{\
static void run( \
Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE* res, Index resIncr, Index resStride, \
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
char side='R', uplo='L'; \
BlasIndex m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
@@ -259,15 +273,21 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} \
\
BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
} \
};

EIGEN_BLAS_SYMM_R(double, double, d, d)
EIGEN_BLAS_SYMM_R(float, float, f, s)
EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z)
EIGEN_BLAS_HEMM_R(scomplex, float, cf, c)

#ifdef EIGEN_USE_MKL
EIGEN_BLAS_SYMM_R(double, double, d, dsymm)
EIGEN_BLAS_SYMM_R(float, float, f, ssymm)
EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm)
EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm)
#else
EIGEN_BLAS_SYMM_R(double, double, d, dsymm_)
EIGEN_BLAS_SYMM_R(float, float, f, ssymm_)
EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_)
EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_)
#endif
} // end namespace internal

} // end namespace Eigen

@@ -83,10 +83,10 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
Scalar t3(0);
Packet ptmp3 = pset1<Packet>(t3);

size_t starti = FirstTriangular ? 0 : j+2;
size_t endi = FirstTriangular ? j : size;
size_t alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
Index starti = FirstTriangular ? 0 : j+2;
Index endi = FirstTriangular ? j : size;
Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
Index alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);

res[j] += cjd.pmul(numext::real(A0[j]), t0);
res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1);
@@ -101,7 +101,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
t2 += cj1.pmul(A0[j+1], rhs[j+1]);
}

for (size_t i=starti; i<alignedStart; ++i)
for (Index i=starti; i<alignedStart; ++i)
{
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
t2 += cj1.pmul(A0[i], rhs[i]);
@@ -113,7 +113,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
for (Index i=alignedStart; i<alignedEnd; i+=PacketSize)
{
Packet A0i = ploadu<Packet>(a0It); a0It += PacketSize;
Packet A1i = ploadu<Packet>(a1It); a1It += PacketSize;
@@ -125,7 +125,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
pstore(resIt,Xi); resIt += PacketSize;
}
for (size_t i=alignedEnd; i<endi; i++)
for (Index i=alignedEnd; i<endi; i++)
{
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
t2 += cj1.pmul(A0[i], rhs[i]);

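Side note on the size_t -> Index switch in this hunk: Index is signed (std::ptrdiff_t by default), so differences such as endi-alignedStart cannot wrap around, and the loop counters no longer mix signedness with the surrounding Index-typed code. The unsigned pitfall in isolation (assuming a 64-bit size_t):

#include <cstddef>
#include <cstdio>
int main() {
  std::size_t a = 3, b = 5;
  std::printf("%zu\n", a - b);   // wraps to 18446744073709551614
  std::ptrdiff_t c = 3, d = 5;
  std::printf("%td\n", c - d);   // -2, as intended
  return 0;
}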
@@ -95,14 +95,21 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \
x_tmp=map_x.conjugate(); \
x_ptr=x_tmp.data(); \
} else x_ptr=_rhs; \
BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
}\
};

#ifdef EIGEN_USE_MKL
EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv)
EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv)
EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv)
#else
EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_)
#endif

} // end namespace internal

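The extra (const BLASTYPE*)&numext::real_ref(alpha) casts in this file exist because the MKL build passes scalars as MKL_Complex8/MKL_Complex16 rather than float/double; taking the address first and casting the pointer keeps one macro body valid for both type spellings. The underlying interop assumption, sketched with a stand-in type (BlasComplex and fake_zhemv are hypothetical):

#include <complex>
struct BlasComplex { double re, im; };            // stand-in for MKL_Complex16
void fake_zhemv(const BlasComplex* alpha) { (void)alpha; }
int main() {
  std::complex<double> alpha(1.0, 2.0);
  // std::complex<double> is layout-compatible with double[2], so passing its
  // address through a pointer cast is the usual BLAS interop idiom:
  fake_zhemv(reinterpret_cast<const BlasComplex*>(&alpha));
  return 0;
}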
@@ -109,10 +109,10 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
internal::general_matrix_matrix_triangular_product<Index,
Scalar, OtherIsRowMajor ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
IsRowMajor ? RowMajor : ColMajor, UpLo>
IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo>
::run(size, depth,
&actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(),
mat.data(), mat.outerStride(), actualAlpha, blocking);
mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
}
};

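Context for threading MatrixType::InnerStrideAtCompileTime and mat.innerStride() through the rank-update selector above: destinations whose coefficients are not contiguous can now reach the product kernels. A minimal sketch of such a destination (assuming Eigen 3.3.9 with these patches applied):

#include <Eigen/Dense>
using namespace Eigen;
int main() {
  double buf[64] = {};
  // 4x4 view over buf with inner stride 2 and outer stride 8:
  Map<MatrixXd, 0, Stride<8,2> > M(buf, 4, 4, Stride<8,2>());
  MatrixXd u = MatrixXd::Random(4, 2);
  M.selfadjointView<Lower>().rankUpdate(u);  // M += u * u^T, lower triangle
  return 0;
}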
@@ -45,22 +45,24 @@ template <typename Scalar, typename Index,
int Mode, bool LhsIsTriangular,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs,
int ResStorageOrder, int Version = Specialized>
int ResStorageOrder, int ResInnerStride,
int Version = Specialized>
struct product_triangular_matrix_matrix;

template <typename Scalar, typename Index,
int Mode, bool LhsIsTriangular,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs, int Version>
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride, int Version>
struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
LhsStorageOrder,ConjugateLhs,
RhsStorageOrder,ConjugateRhs,RowMajor,Version>
RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,Version>
{
static EIGEN_STRONG_INLINE void run(
Index rows, Index cols, Index depth,
const Scalar* lhs, Index lhsStride,
const Scalar* rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
product_triangular_matrix_matrix<Scalar, Index,
@@ -70,18 +72,19 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
ConjugateRhs,
LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
ConjugateLhs,
ColMajor>
::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
ColMajor, ResInnerStride>
::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking);
}
};

// implements col-major += alpha * op(triangular) * op(general)
template <typename Scalar, typename Index, int Mode,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs, int Version>
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride, int Version>
struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
LhsStorageOrder,ConjugateLhs,
RhsStorageOrder,ConjugateRhs,ColMajor,Version>
RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>
{

typedef gebp_traits<Scalar,Scalar> Traits;
@@ -95,20 +98,21 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};

template <typename Scalar, typename Index, int Mode,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs, int Version>
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride, int Version>
EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
LhsStorageOrder,ConjugateLhs,
RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* _res, Index resStride,
Scalar* _res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
// strip zeros
@@ -119,10 +123,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,

typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
ResMapper res(_res, resStride, resIncr);

Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -137,7 +141,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());

Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
// To work around an "error: member reference base type 'Matrix<...>
// (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is
// not a structure or union" compilation error in nvcc (tested V8.0.61),
// create a dummy internal::constructor_without_unaligned_array_assert
// object to pass to the Matrix constructor.
internal::constructor_without_unaligned_array_assert a;
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer(a);
triangularBuffer.setZero();
if((Mode&ZeroDiag)==ZeroDiag)
triangularBuffer.diagonal().setZero();
@@ -229,10 +239,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
// implements col-major += alpha * op(general) * op(triangular)
template <typename Scalar, typename Index, int Mode,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs, int Version>
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride, int Version>
struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
LhsStorageOrder,ConjugateLhs,
RhsStorageOrder,ConjugateRhs,ColMajor,Version>
RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>
{
typedef gebp_traits<Scalar,Scalar> Traits;
enum {
@@ -245,20 +256,21 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};

template <typename Scalar, typename Index, int Mode,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs, int Version>
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride, int Version>
EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
LhsStorageOrder,ConjugateLhs,
RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* _res, Index resStride,
Scalar* _res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
@@ -270,10 +282,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,

typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
ResMapper res(_res, resStride, resIncr);

Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -284,7 +296,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());

Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
internal::constructor_without_unaligned_array_assert a;
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer(a);
triangularBuffer.setZero();
if((Mode&ZeroDiag)==ZeroDiag)
triangularBuffer.diagonal().setZero();
@@ -393,6 +406,8 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
{
template<typename Dest> static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha)
{
typedef typename Lhs::Scalar LhsScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef typename Dest::Scalar Scalar;

typedef internal::blas_traits<Lhs> LhsBlasTraits;
@@ -405,8 +420,9 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);

Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
* RhsBlasTraits::extractScalarFactor(a_rhs);
LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs);
RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs);
Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha;

typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
@@ -423,14 +439,29 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
Mode, LhsIsTriangular,
(internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
(internal::traits<ActualRhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
(internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor>
(internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor, Dest::InnerStrideAtCompileTime>
::run(
stripedRows, stripedCols, stripedDepth, // sizes
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
&dst.coeffRef(0,0), dst.outerStride(), // result info
&dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info
actualAlpha, blocking
);

// Apply correction if the diagonal is unit and a scalar factor was nested:
if ((Mode&UnitDiag)==UnitDiag)
{
if (LhsIsTriangular && lhs_alpha!=LhsScalar(1))
{
Index diagSize = (std::min)(lhs.rows(),lhs.cols());
dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize);
}
else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1))
{
Index diagSize = (std::min)(rhs.rows(),rhs.cols());
dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize);
}
}
}
};

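About the correction block added at the end of run() above: blas_traits may hoist a scalar factor s out of a unit-diagonal triangular operand, and the kernel then effectively multiplies the implicit unit diagonal by s as well; the surplus (s-1)*rhs, restricted to the rows covered by the diagonal, is subtracted afterwards. The triggering pattern, as a sketch (near-zero difference assumes the patched behavior):

#include <Eigen/Dense>
#include <iostream>
using namespace Eigen;
int main() {
  MatrixXd A = MatrixXd::Random(4,4), B = MatrixXd::Random(4,4);
  double s = 2.0;
  // blas_traits extracts s from (s*A); without the correction the implicit
  // unit diagonal would be scaled to s too.
  MatrixXd C = (s * A).triangularView<UnitLower>() * B;
  MatrixXd T = (s * A).eval().triangularView<UnitLower>();  // dense reference
  std::cout << (C - T * B).norm() << "\n";                  // ~0 after the fix
  return 0;
}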
@@ -46,7 +46,7 @@ template <typename Scalar, typename Index,
struct product_triangular_matrix_matrix_trmm :
product_triangular_matrix_matrix<Scalar,Index,Mode,
LhsIsTriangular,LhsStorageOrder,ConjugateLhs,
RhsStorageOrder, ConjugateRhs, ResStorageOrder, BuiltIn> {};
RhsStorageOrder, ConjugateRhs, ResStorageOrder, 1, BuiltIn> {};


// try to go to BLAS specialization
@@ -55,9 +55,11 @@ template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,Specialized> { \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,1,Specialized> { \
static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\
const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
product_triangular_matrix_matrix_trmm<Scalar,Index,Mode, \
LhsIsTriangular,LhsStorageOrder,ConjugateLhs, \
RhsStorageOrder, ConjugateRhs, ColMajor>::run( \
@@ -75,7 +77,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)

// implements col-major += alpha * op(triangular) * op(general)
#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -115,8 +117,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
/* Most likely no benefit to call TRMM or GEMM from BLAS */ \
product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \
/*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \
} else { \
/* Make sense to call GEMM */ \
@@ -124,8 +126,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \
rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, 1, resStride, alpha, gemm_blocking, 0); \
\
/*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
} \
@@ -172,7 +174,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
} \
/*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \
/* call ?trmm*/ \
BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
\
/* Add op(a_triangular)*b into res*/ \
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@@ -180,13 +182,20 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
} \
};

EIGEN_BLAS_TRMM_L(double, double, d, d)
EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z)
EIGEN_BLAS_TRMM_L(float, float, f, s)
EIGEN_BLAS_TRMM_L(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRMM_L(double, double, d, dtrmm)
EIGEN_BLAS_TRMM_L(dcomplex, MKL_Complex16, cd, ztrmm)
EIGEN_BLAS_TRMM_L(float, float, f, strmm)
EIGEN_BLAS_TRMM_L(scomplex, MKL_Complex8, cf, ctrmm)
#else
EIGEN_BLAS_TRMM_L(double, double, d, dtrmm_)
EIGEN_BLAS_TRMM_L(dcomplex, double, cd, ztrmm_)
EIGEN_BLAS_TRMM_L(float, float, f, strmm_)
EIGEN_BLAS_TRMM_L(scomplex, float, cf, ctrmm_)
#endif

// implements col-major += alpha * op(general) * op(triangular)
#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -225,8 +234,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
/* Most likely no benefit to call TRMM or GEMM from BLAS*/ \
product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \
/*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \
} else { \
/* Make sense to call GEMM */ \
@@ -234,8 +243,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \
rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, 1, resStride, alpha, gemm_blocking, 0); \
\
/*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
} \
@@ -282,7 +291,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
} \
/*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \
/* call ?trmm*/ \
BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
\
/* Add op(a_triangular)*b into res*/ \
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@@ -290,11 +299,17 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
} \
};

EIGEN_BLAS_TRMM_R(double, double, d, d)
EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z)
EIGEN_BLAS_TRMM_R(float, float, f, s)
EIGEN_BLAS_TRMM_R(scomplex, float, cf, c)

#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRMM_R(double, double, d, dtrmm)
EIGEN_BLAS_TRMM_R(dcomplex, MKL_Complex16, cd, ztrmm)
EIGEN_BLAS_TRMM_R(float, float, f, strmm)
EIGEN_BLAS_TRMM_R(scomplex, MKL_Complex8, cf, ctrmm)
#else
EIGEN_BLAS_TRMM_R(double, double, d, dtrmm_)
EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_)
EIGEN_BLAS_TRMM_R(float, float, f, strmm_)
EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_)
#endif
} // end namespace internal

} // end namespace Eigen

@@ -221,8 +221,9 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);

ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
* RhsBlasTraits::extractScalarFactor(rhs);
LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;

enum {
// FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
@@ -274,6 +275,12 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
else
dest = MappedDest(actualDestPtr, dest.size());
}

if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) )
{
Index diagSize = (std::min)(lhs.rows(),lhs.cols());
dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize);
}
}
};

@@ -295,8 +302,9 @@ template<int Mode> struct trmv_selector<Mode,RowMajor>
typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);

ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
* RhsBlasTraits::extractScalarFactor(rhs);
LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;

enum {
DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
@@ -326,6 +334,12 @@ template<int Mode> struct trmv_selector<Mode,RowMajor>
actualRhsPtr,1,
dest.data(),dest.innerStride(),
actualAlpha);

if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) )
{
Index diagSize = (std::min)(lhs.rows(),lhs.cols());
dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize);
}
}
};

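The same unit-diagonal correction appears on both matrix-vector paths above; only dest.head(diagSize) is adjusted because only the rows covered by the diagonal received the spurious factor. Vector-sized sketch of the pattern being fixed:

#include <Eigen/Dense>
using namespace Eigen;
int main() {
  MatrixXd A = MatrixXd::Random(4,4);
  VectorXd x = VectorXd::Random(4);
  // The factor 3 is extracted by blas_traits; the correction keeps the
  // implicit unit diagonal at 1 instead of 3.
  VectorXd y = (3.0 * A).triangularView<UnitUpper>() * x;
  return 0;
}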
@@ -71,7 +71,7 @@ EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)

// implements col-major: res += alpha * op(triangular) * vector
#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
enum { \
@@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
diag = IsUnitDiag ? 'U' : 'N'; \
\
/* call ?TRMV*/ \
BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
\
/* Add op(a_tr)rhs into res*/ \
BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
if (size<(std::max)(rows,cols)) { \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
@@ -142,18 +142,25 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
m = convert_index<BlasIndex>(size); \
n = convert_index<BlasIndex>(cols-size); \
} \
BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
} \
} \
};

EIGEN_BLAS_TRMV_CM(double, double, d, d)
EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z)
EIGEN_BLAS_TRMV_CM(float, float, f, s)
EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRMV_CM(double, double, d, d,)
EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,)
EIGEN_BLAS_TRMV_CM(float, float, f, s,)
EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,)
#else
EIGEN_BLAS_TRMV_CM(double, double, d, d, _)
EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _)
EIGEN_BLAS_TRMV_CM(float, float, f, s, _)
EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _)
#endif

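Unlike the ?SYMM/?TRSM macros, which now take the whole function name, the TRMV macros keep token pasting because one body calls three routines (?trmv, ?axpy, ?gemv); the new BLASPOSTFIX argument is appended to each name and is empty for MKL, `_` for the Fortran interface. How the pasting behaves, with hypothetical names:

#include <cstdio>
void my_dtrmv (int n) { std::printf("vendor trmv  %d\n", n); }
void my_dtrmv_(int n) { std::printf("fortran trmv %d\n", n); }
#define CALL_TRMV(PREFIX, POSTFIX, N) PREFIX##trmv##POSTFIX(N)
int main() {
  CALL_TRMV(my_d,  , 3);  // expands to my_dtrmv(3)
  CALL_TRMV(my_d, _, 3);  // expands to my_dtrmv_(3)
  return 0;
}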
// implements row-major: res += alpha * op(triangular) * vector
#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
enum { \
@@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
diag = IsUnitDiag ? 'U' : 'N'; \
\
/* call ?TRMV*/ \
BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
\
/* Add op(a_tr)rhs into res*/ \
BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
if (size<(std::max)(rows,cols)) { \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
@@ -224,15 +231,22 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
m = convert_index<BlasIndex>(size); \
n = convert_index<BlasIndex>(cols-size); \
} \
BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
} \
} \
};

EIGEN_BLAS_TRMV_RM(double, double, d, d)
EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z)
EIGEN_BLAS_TRMV_RM(float, float, f, s)
EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRMV_RM(double, double, d, d,)
EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,)
EIGEN_BLAS_TRMV_RM(float, float, f, s,)
EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,)
#else
EIGEN_BLAS_TRMV_RM(double, double, d, d,_)
EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_)
EIGEN_BLAS_TRMV_RM(float, float, f, s,_)
EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_)
#endif

} // end namespace internal

@@ -15,48 +15,48 @@ namespace Eigen {
namespace internal {

// if the rhs is row major, let's transpose the product
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder>
struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor>
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor,OtherInnerStride>
{
static void run(
Index size, Index cols,
const Scalar* tri, Index triStride,
Scalar* _other, Index otherStride,
Scalar* _other, Index otherIncr, Index otherStride,
level3_blocking<Scalar,Scalar>& blocking)
{
triangular_solve_matrix<
Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft,
(Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper),
NumTraits<Scalar>::IsComplex && Conjugate,
TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor>
::run(size, cols, tri, triStride, _other, otherStride, blocking);
TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor, OtherInnerStride>
::run(size, cols, tri, triStride, _other, otherIncr, otherStride, blocking);
}
};

/* Optimized triangular solver with multiple right hand side and the triangular matrix on the left
*/
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder,int OtherInnerStride>
struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
{
static EIGEN_DONT_INLINE void run(
Index size, Index otherSize,
const Scalar* _tri, Index triStride,
Scalar* _other, Index otherStride,
Scalar* _other, Index otherIncr, Index otherStride,
level3_blocking<Scalar,Scalar>& blocking);
};
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
Index size, Index otherSize,
const Scalar* _tri, Index triStride,
Scalar* _other, Index otherStride,
Scalar* _other, Index otherIncr, Index otherStride,
level3_blocking<Scalar,Scalar>& blocking)
{
Index cols = otherSize;

typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> OtherMapper;
TriMapper tri(_tri, triStride);
OtherMapper other(_other, otherStride);
OtherMapper other(_other, otherStride, otherIncr);

typedef gebp_traits<Scalar,Scalar> Traits;

@@ -128,19 +128,19 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
{
Scalar b(0);
const Scalar* l = &tri(i,s);
Scalar* r = &other(s,j);
typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
for (Index i3=0; i3<k; ++i3)
b += conj(l[i3]) * r[i3];
b += conj(l[i3]) * r(i3);

other(i,j) = (other(i,j) - b)*a;
}
else
{
Scalar b = (other(i,j) *= a);
Scalar* r = &other(s,j);
const Scalar* l = &tri(s,i);
typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
typename TriMapper::LinearMapper l = tri.getLinearMapper(s,i);
for (Index i3=0;i3<rs;++i3)
r[i3] -= b * conj(l[i3]);
r(i3) -= b * conj(l(i3));
}
}
}
@@ -185,28 +185,28 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju

/* Optimized triangular solver with multiple left hand sides and the triangular matrix on the right
*/
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
{
static EIGEN_DONT_INLINE void run(
Index size, Index otherSize,
const Scalar* _tri, Index triStride,
Scalar* _other, Index otherStride,
Scalar* _other, Index otherIncr, Index otherStride,
level3_blocking<Scalar,Scalar>& blocking);
};
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
Index size, Index otherSize,
const Scalar* _tri, Index triStride,
Scalar* _other, Index otherStride,
Scalar* _other, Index otherIncr, Index otherStride,
level3_blocking<Scalar,Scalar>& blocking)
{
Index rows = otherSize;
typedef typename NumTraits<Scalar>::Real RealScalar;

typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
LhsMapper lhs(_other, otherStride);
LhsMapper lhs(_other, otherStride, otherIncr);
RhsMapper rhs(_tri, triStride);

typedef gebp_traits<Scalar,Scalar> Traits;
@@ -297,24 +297,24 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
{
Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k;

Scalar* r = &lhs(i2,j);
typename LhsMapper::LinearMapper r = lhs.getLinearMapper(i2,j);
for (Index k3=0; k3<k; ++k3)
{
Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j));
Scalar* a = &lhs(i2,IsLower ? j+1+k3 : absolute_j2+k3);
typename LhsMapper::LinearMapper a = lhs.getLinearMapper(i2,IsLower ? j+1+k3 : absolute_j2+k3);
for (Index i=0; i<actual_mc; ++i)
r[i] -= a[i] * b;
r(i) -= a(i) * b;
}
if((Mode & UnitDiag)==0)
{
Scalar inv_rjj = RealScalar(1)/conj(rhs(j,j));
for (Index i=0; i<actual_mc; ++i)
r[i] *= inv_rjj;
r(i) *= inv_rjj;
}
}

// pack the just computed part of lhs to A
pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
pack_lhs_panel(blockA, lhs.getSubMapper(i2,absolute_j2),
actualPanelWidth, actual_mc,
actual_kc, j2);
}

@@ -38,9 +38,9 @@ namespace Eigen {
namespace internal {

// implements LeftSide op(triangular)^-1 * general
#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \
#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
{ \
enum { \
IsLower = (Mode&Lower) == Lower, \
@@ -51,8 +51,10 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
static void run( \
Index size, Index otherSize, \
const EIGTYPE* _tri, Index triStride, \
EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
{ \
EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
eigen_assert(otherIncr == 1); \
BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb; \
char side = 'L', uplo, diag='N', transa; \
/* Set alpha_ */ \
@@ -80,20 +82,26 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
} \
if (IsUnitDiag) diag='U'; \
/* call ?trsm*/ \
BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
} \
};

EIGEN_BLAS_TRSM_L(double, double, d)
EIGEN_BLAS_TRSM_L(dcomplex, double, z)
EIGEN_BLAS_TRSM_L(float, float, s)
EIGEN_BLAS_TRSM_L(scomplex, float, c)

#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRSM_L(double, double, dtrsm)
EIGEN_BLAS_TRSM_L(dcomplex, MKL_Complex16, ztrsm)
EIGEN_BLAS_TRSM_L(float, float, strsm)
EIGEN_BLAS_TRSM_L(scomplex, MKL_Complex8, ctrsm)
#else
EIGEN_BLAS_TRSM_L(double, double, dtrsm_)
EIGEN_BLAS_TRSM_L(dcomplex, double, ztrsm_)
EIGEN_BLAS_TRSM_L(float, float, strsm_)
EIGEN_BLAS_TRSM_L(scomplex, float, ctrsm_)
#endif

// implements RightSide general * op(triangular)^-1
#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASPREFIX) \
#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
{ \
enum { \
IsLower = (Mode&Lower) == Lower, \
@@ -104,8 +112,10 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
static void run( \
Index size, Index otherSize, \
const EIGTYPE* _tri, Index triStride, \
EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
{ \
EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
eigen_assert(otherIncr == 1); \
BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb; \
char side = 'R', uplo, diag='N', transa; \
/* Set alpha_ */ \
@@ -133,16 +143,22 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
} \
if (IsUnitDiag) diag='U'; \
/* call ?trsm*/ \
BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
/*std::cout << "TRMS_L specialization!\n";*/ \
} \
};

EIGEN_BLAS_TRSM_R(double, double, d)
EIGEN_BLAS_TRSM_R(dcomplex, double, z)
EIGEN_BLAS_TRSM_R(float, float, s)
EIGEN_BLAS_TRSM_R(scomplex, float, c)

#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRSM_R(double, double, dtrsm)
EIGEN_BLAS_TRSM_R(dcomplex, MKL_Complex16, ztrsm)
EIGEN_BLAS_TRSM_R(float, float, strsm)
EIGEN_BLAS_TRSM_R(scomplex, MKL_Complex8, ctrsm)
#else
EIGEN_BLAS_TRSM_R(double, double, dtrsm_)
EIGEN_BLAS_TRSM_R(dcomplex, double, ztrsm_)
EIGEN_BLAS_TRSM_R(float, float, strsm_)
EIGEN_BLAS_TRSM_R(scomplex, float, ctrsm_)
#endif

} // end namespace internal

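For orientation, the user-facing entry point that lands in these ?trsm specializations (dispatch to BLAS requires building with EIGEN_USE_BLAS or EIGEN_USE_MKL_ALL; plain Eigen uses the built-in kernel):

#include <Eigen/Dense>
using namespace Eigen;
int main() {
  MatrixXd A = MatrixXd::Random(5,5), B = MatrixXd::Random(5,3);
  A.diagonal().array() += 5.0;                // keep the triangle well conditioned
  A.triangularView<Lower>().solveInPlace(B);  // B <- Lower(A)^{-1} * B
  return 0;
}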
@@ -31,7 +31,7 @@ template<
typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResStorageOrder>
int ResStorageOrder, int ResInnerStride>
struct general_matrix_matrix_product;

template<typename Index,
@@ -155,13 +155,21 @@ class BlasVectorMapper {
Scalar* m_data;
};

template<typename Scalar, typename Index, int AlignmentType, int Incr=1>
class BlasLinearMapper;

template<typename Scalar, typename Index, int AlignmentType>
class BlasLinearMapper {
class BlasLinearMapper<Scalar,Index,AlignmentType,1> {
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data, Index incr=1)
: m_data(data)
{
EIGEN_ONLY_USED_FOR_DEBUG(incr);
eigen_assert(incr==1);
}

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
internal::prefetch(&operator()(i));
@@ -188,8 +196,12 @@ class BlasLinearMapper {
};

// Lightweight helper class to access matrix coefficients.
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned>
class blas_data_mapper {
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned, int Incr = 1>
class blas_data_mapper;

template<typename Scalar, typename Index, int StorageOrder, int AlignmentType>
class blas_data_mapper<Scalar,Index,StorageOrder,AlignmentType,1>
{
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
@@ -197,7 +209,12 @@ class blas_data_mapper {
typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
typedef BlasVectorMapper<Scalar, Index> VectorMapper;

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr=1)
: m_data(data), m_stride(stride)
{
EIGEN_ONLY_USED_FOR_DEBUG(incr);
eigen_assert(incr==1);
}

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
getSubMapper(Index i, Index j) const {
@@ -251,6 +268,90 @@ class blas_data_mapper {
const Index m_stride;
};

// Implementation of non-natural increment (i.e. inner-stride != 1)
// The exposed API is not complete yet compared to the Incr==1 case
// because some features make less sense in this case.

template<typename Scalar, typename Index, int AlignmentType, int Incr>
class BlasLinearMapper
{
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data,Index incr) : m_data(data), m_incr(incr) {}

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
internal::prefetch(&operator()(i));
}

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
return m_data[i*m_incr.value()];
}

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
return pgather<Scalar,Packet>(m_data + i*m_incr.value(), m_incr.value());
}

template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
pscatter<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value());
}

protected:
Scalar *m_data;
const internal::variable_if_dynamic<Index,Incr> m_incr;
};

template<typename Scalar, typename Index, int StorageOrder, int AlignmentType,int Incr>
class blas_data_mapper
{
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;

typedef BlasLinearMapper<Scalar, Index, AlignmentType,Incr> LinearMapper;

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr) : m_data(data), m_stride(stride), m_incr(incr) {}

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper
getSubMapper(Index i, Index j) const {
return blas_data_mapper(&operator()(i, j), m_stride, m_incr.value());
}

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
return LinearMapper(&operator()(i, j), m_incr.value());
}

EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
return m_data[StorageOrder==RowMajor ? j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride];
}

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
return pgather<Scalar,Packet>(&operator()(i, j),m_incr.value());
}

template <typename PacketT, int AlignmentT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
return pgather<Scalar,PacketT>(&operator()(i, j),m_incr.value());
}

template<typename SubPacket>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
}

template<typename SubPacket>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
}

protected:
Scalar* EIGEN_RESTRICT m_data;
const Index m_stride;
const internal::variable_if_dynamic<Index,Incr> m_incr;
};

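The essential content of the Incr!=1 mappers above is the addressing rule coeff(i,j) = data[i*incr + j*stride] (column-major) plus pgather/pscatter in place of contiguous packet loads and stores. A stripped-down scalar model of the same idea (not Eigen's class, just the addressing):

#include <cstdio>
struct StridedMapper {
  double* data; long stride, incr;  // outer stride and inner increment
  double& operator()(long i, long j) const { return data[i*incr + j*stride]; }
};
int main() {
  double buf[24] = {};
  StridedMapper m{buf, 8, 2};       // 4x3 column-major view, inner stride 2
  for (long j = 0; j < 3; ++j)
    for (long i = 0; i < 4; ++i)
      m(i, j) = 10.0*i + j;
  std::printf("%g\n", m(2, 1));     // prints 21
  return 0;
}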
// lightweight helper class to access matrix coefficients (const version)
template<typename Scalar, typename Index, int StorageOrder>
class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {

@@ -43,13 +43,24 @@
#endif
#pragma clang diagnostic ignored "-Wconstant-logical-operand"

#elif defined __GNUC__ && __GNUC__>=6
#elif defined __GNUC__

#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
#if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
#pragma GCC diagnostic push
#endif
// g++ warns about local variables shadowing member functions, which is too strict
#pragma GCC diagnostic ignored "-Wshadow"
#if __GNUC__ == 4 && __GNUC_MINOR__ < 8
// Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
#pragma GCC diagnostic ignored "-Wtype-limits"
#endif
#if __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes"

#endif
#if __GNUC__==7
// See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
#pragma GCC diagnostic ignored "-Wattributes"
#endif
#endif
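
For orientation, these pragmas follow GCC's save/disable/restore protocol; a condensed sketch of the pattern (the matching pop is issued by Eigen's ReenableStupidWarnings.h):

#pragma GCC diagnostic push               // save the current warning state
#pragma GCC diagnostic ignored "-Wshadow" // silence one diagnostic locally
/* ... code that would otherwise warn ... */
#pragma GCC diagnostic pop                // restore the saved state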

#if defined __NVCC__
@@ -72,4 +83,12 @@
#pragma diag_suppress 2737
#endif

#else
// warnings already disabled:
# ifndef EIGEN_WARNINGS_DISABLED_2
# define EIGEN_WARNINGS_DISABLED_2
# elif defined(EIGEN_INTERNAL_DEBUGGING)
# error "Do not include \"DisableStupidWarnings.h\" recursively more than twice!"
# endif

#endif // not EIGEN_WARNINGS_DISABLED
@@ -47,11 +47,7 @@ template<typename T> struct NumTraits;
template<typename Derived> struct EigenBase;
template<typename Derived> class DenseBase;
template<typename Derived> class PlainObjectBase;

template<typename Derived,
         int Level = internal::accessors_level<Derived>::value >
class DenseCoeffsBase;
template<typename Derived, int Level> class DenseCoeffsBase;

template<typename _Scalar, int _Rows, int _Cols,
         int _Options = AutoAlign |
@@ -49,10 +49,11 @@
#define EIGEN_USE_LAPACKE
#endif

#if defined(EIGEN_USE_MKL_VML)
#if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL)
#define EIGEN_USE_MKL
#endif


#if defined EIGEN_USE_MKL
# include <mkl.h>
/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/

@@ -108,6 +109,10 @@
#endif
#endif

#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL)
#include "../../misc/blas.h"
#endif

namespace Eigen {

typedef std::complex<double> dcomplex;

@@ -121,8 +126,5 @@ typedef int BlasIndex;

} // end namespace Eigen

#if defined(EIGEN_USE_BLAS)
#include "../../misc/blas.h"
#endif

#endif // EIGEN_MKL_SUPPORT_H
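
The added !defined(EIGEN_USE_MKL) guard presumably keeps EIGEN_USE_MKL from being re-#defined when a user enables both switches explicitly; illustrative client code (the flags shown are the user's choice, not Eigen requirements):

#define EIGEN_USE_MKL       // request full MKL support
#define EIGEN_USE_MKL_VML   // additionally request VML; no longer re-#defines EIGEN_USE_MKL
#include <Eigen/Core>       // picks up <mkl.h> and the BLAS/LAPACKE bindings above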
@@ -13,7 +13,7 @@

#define EIGEN_WORLD_VERSION 3
#define EIGEN_MAJOR_VERSION 3
#define EIGEN_MINOR_VERSION 2
#define EIGEN_MINOR_VERSION 9

#define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                      (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
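
Downstream code typically consumes this version bump through the EIGEN_VERSION_AT_LEAST macro defined here, e.g.:

#if EIGEN_VERSION_AT_LEAST(3,3,9)
// safe to rely on behavior first shipped in Eigen 3.3.9
#endif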
@@ -80,8 +80,8 @@
//  2015    14    1900
//  "15"    15    1900

/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not, e.g., ICC
#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC)
/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not, e.g., ICC or clang-cl
#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG)
#define EIGEN_COMP_MSVC_STRICT _MSC_VER
#else
#define EIGEN_COMP_MSVC_STRICT 0
@@ -380,7 +380,8 @@
#if EIGEN_MAX_CPP_VER>=11 && \
    ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \
  || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \
  || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)))
  || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \
  || (EIGEN_COMP_MSVC >= 1900) )
#define EIGEN_HAS_C99_MATH 1
#else
#define EIGEN_HAS_C99_MATH 0
@@ -396,10 +397,24 @@
#endif
#endif

// Does the compiler support type_traits?
// - full support of type traits was added only to GCC 5.1.0.
// - 20150626 corresponds to the last release of 4.x libstdc++
#ifndef EIGEN_HAS_TYPE_TRAITS
#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) \
  && ((!EIGEN_COMP_GNUC_STRICT) || EIGEN_GNUC_AT_LEAST(5, 1)) \
  && ((!defined(__GLIBCXX__)) || __GLIBCXX__ > 20150626)
#define EIGEN_HAS_TYPE_TRAITS 1
#define EIGEN_INCLUDE_TYPE_TRAITS
#else
#define EIGEN_HAS_TYPE_TRAITS 0
#endif
#endif
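
A sketch of how the two macros are presumably meant to divide the work (assumed usage, mirroring how the Core header gates standard includes):

#ifdef EIGEN_INCLUDE_TYPE_TRAITS
#include <type_traits>   // only pulled in when the toolchain fully supports it
#endif
#if EIGEN_HAS_TYPE_TRAITS
typedef std::integral_constant<bool,true> my_true_t; // illustrative alias only
#endif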

// Does the compiler support variadic templates?
#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
  && ( !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) )
  && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) )
// ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
//    this prevents nvcc from crashing when compiling Eigen on Tegra X1
#define EIGEN_HAS_VARIADIC_TEMPLATES 1
@@ -413,7 +428,7 @@

#ifdef __CUDACC__
// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500))
#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500))
#define EIGEN_HAS_CONSTEXPR 1
#endif
#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
@@ -487,11 +502,13 @@
// EIGEN_STRONG_INLINE is a stronger version of the inline keyword, using __forceinline on MSVC,
// but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline
// but GCC is still doing fine with just inline.
#ifndef EIGEN_STRONG_INLINE
#if EIGEN_COMP_MSVC || EIGEN_COMP_ICC
#define EIGEN_STRONG_INLINE __forceinline
#else
#define EIGEN_STRONG_INLINE inline
#endif
#endif
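
A hypothetical helper showing the intended effect: force inlining where MSVC/ICC need the stronger hint, fall back to plain inline elsewhere:

template<typename T>
EIGEN_STRONG_INLINE T square(const T& x) { return x * x; } // illustrative only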

// EIGEN_ALWAYS_INLINE is the strongest, it has the effect of making the function inline and adding every possible
// attribute to maximize inlining. This should only be used when really necessary: in particular,
@@ -812,7 +829,8 @@ namespace Eigen {
// just an empty macro!
#define EIGEN_EMPTY

#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || __CUDACC_VER__) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0)
// for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
  using Base::operator =;
#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
@@ -832,11 +850,48 @@ namespace Eigen {
#endif


/**
 * \internal
 * \brief Macro to explicitly define the default copy constructor.
 * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden.
 */
#if EIGEN_HAS_CXX11
#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default;
#else
#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS)
#endif
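
The deprecation in question is the C++11 rule that a user-declared copy-assignment operator makes the implicitly generated copy constructor deprecated; a minimal illustration with a hypothetical class:

struct S {
  S() {}
  S& operator=(const S&) { return *this; } // user-declared copy-assignment...
  EIGEN_DEFAULT_COPY_CONSTRUCTOR(S)        // ...so S(const S&) is re-declared = default explicitly
};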


/** \internal
 * \brief Macro to manually inherit assignment operators.
 * This is necessary because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
 * With C++11 or later this also default-implements the copy-constructor.
 */
#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
    EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
    EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)

/** \internal
 * \brief Macro to manually define default constructors and destructors.
 * This is necessary when the copy constructor is re-defined.
 * For empty helper classes this should usually be protected, to avoid accidentally creating empty objects.
 *
 * Hiding the default destructor leads to problems in C++03 mode together with boost::multiprecision.
 */
#if EIGEN_HAS_CXX11
#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \
    EIGEN_DEVICE_FUNC Derived() = default; \
    EIGEN_DEVICE_FUNC ~Derived() = default;
#else
#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \
    EIGEN_DEVICE_FUNC Derived() {} \
    /* EIGEN_DEVICE_FUNC ~Derived() {} */
#endif
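
As the comment suggests, empty helper classes typically hide these members in a protected section so the helper cannot be instantiated on its own; a hypothetical sketch:

class my_empty_helper // illustrative, not an Eigen class
{
protected:
  EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(my_empty_helper)
};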



/**
 * Just a side note. Commenting within defines works only by documenting
@@ -986,7 +1041,13 @@ namespace Eigen {
# define EIGEN_NOEXCEPT
# define EIGEN_NOEXCEPT_IF(x)
# define EIGEN_NO_THROW throw()
# if EIGEN_COMP_MSVC
    // MSVC does not support exception specifications (warning C4290),
    // and they are deprecated in c++11 anyway.
#   define EIGEN_EXCEPTION_SPEC(X) throw()
# else
#   define EIGEN_EXCEPTION_SPEC(X) throw(X)
# endif
#endif

#endif // EIGEN_MACROS_H

@@ -70,7 +70,7 @@ inline void throw_std_bad_alloc()
  throw std::bad_alloc();
#else
  std::size_t huge = static_cast<std::size_t>(-1);
  new int[huge];
  ::operator new(huge);
#endif
}
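
The switch to ::operator new presumably sidesteps compilers that warn on (or optimize away) an array-new whose requested size exceeds the maximum object size; the no-exceptions branch only needs some allocation that is guaranteed to fail. The idea as a standalone sketch (illustrative, not Eigen code):

#include <cstddef>
void force_allocation_failure() {
  std::size_t huge = static_cast<std::size_t>(-1);
  ::operator new(huge); // cannot succeed; triggers the installed allocation-failure handling
}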
@@ -150,7 +150,7 @@ EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
  * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
  */
EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size)
EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
{
  check_that_malloc_is_allowed();
@@ -185,7 +185,7 @@ EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
  * \brief Reallocates an aligned block of memory.
  * \throws std::bad_alloc on allocation failure
  */
inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size)
{
  EIGEN_UNUSED_VARIABLE(old_size);
@@ -209,12 +209,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
/** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
  * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
  */
template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size)
template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std::size_t size)
{
  return aligned_malloc(size);
}

template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(size_t size)
template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std::size_t size)
{
  check_that_malloc_is_allowed();
@@ -235,12 +235,12 @@ template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *p
  std::free(ptr);
}

template<bool Align> inline void* conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size)
template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
{
  return aligned_realloc(ptr, new_size, old_size);
}

template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new_size, size_t)
template<> inline void* conditional_aligned_realloc<false>(void* ptr, std::size_t new_size, std::size_t)
{
  return std::realloc(ptr, new_size);
}
@@ -252,7 +252,7 @@ template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new
/** \internal Destructs the elements of an array.
  * The \a size parameter tells on how many objects to call the destructor of T.
  */
template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size)
template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, std::size_t size)
{
  // always destruct an array starting from the end.
  if(ptr)
@@ -262,9 +262,9 @@ template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T
/** \internal Constructs the elements of an array.
  * The \a size parameter tells on how many objects to call the constructor of T.
  */
template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size)
template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size)
{
  size_t i;
  std::size_t i;
  EIGEN_TRY
  {
    for (i = 0; i < size; ++i) ::new (ptr + i) T;
@@ -283,9 +283,9 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *
*****************************************************************************/

template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t size)
{
  if(size > size_t(-1) / sizeof(T))
  if(size > std::size_t(-1) / sizeof(T))
    throw_std_bad_alloc();
}
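
The guard works because the byte count sizeof(T)*size wraps around exactly when size exceeds the largest size_t value divided by sizeof(T); spelled out with a hypothetical helper:

#include <cstddef>
template<typename T>
bool multiplication_would_wrap(std::size_t n) {
  // std::size_t(-1) is the maximum value representable in size_t
  return n > static_cast<std::size_t>(-1) / sizeof(T);
}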
@@ -293,7 +293,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
  * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
  * The default constructor of T is called.
  */
template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size)
{
  check_size_for_overflow<T>(size);
  T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
@@ -309,7 +309,7 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
  return result;
}

template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size)
{
  check_size_for_overflow<T>(size);
  T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
@@ -328,7 +328,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
/** \internal Deletes objects constructed with aligned_new
  * The \a size parameter tells on how many objects to call the destructor of T.
  */
template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size)
template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size)
{
  destruct_elements_of_array<T>(ptr, size);
  aligned_free(ptr);
@@ -337,13 +337,13 @@ template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t
/** \internal Deletes objects constructed with conditional_aligned_new
  * The \a size parameter tells on how many objects to call the destructor of T.
  */
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, std::size_t size)
{
  destruct_elements_of_array<T>(ptr, size);
  conditional_aligned_free<Align>(ptr);
}

template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, std::size_t new_size, std::size_t old_size)
{
  check_size_for_overflow<T>(new_size);
  check_size_for_overflow<T>(old_size);
@@ -366,7 +366,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
}


template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(std::size_t size)
{
  if(size==0)
    return 0; // short-cut. Also fixes Bug 884
@@ -387,7 +387,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
  return result;
}

template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(T* pts, size_t new_size, size_t old_size)
template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size)
{
  check_size_for_overflow<T>(new_size);
  check_size_for_overflow<T>(old_size);
@@ -409,7 +409,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(
  return result;
}

template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, std::size_t size)
{
  if(NumTraits<T>::RequireInitialization)
    destruct_elements_of_array<T>(ptr, size);
@@ -493,7 +493,7 @@ template<typename T> struct smart_copy_helper<T,true> {
  IntPtr size = IntPtr(end)-IntPtr(start);
  if(size==0) return;
  eigen_internal_assert(start!=0 && end!=0 && target!=0);
  memcpy(target, start, size);
  std::memcpy(target, start, size);
}
};
@@ -561,7 +561,7 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
  * In this case, the buffer elements will also be destructed when this handler is destructed.
  * Finally, if \a dealloc is true, then the pointer \a ptr is freed.
  **/
aligned_stack_memory_handler(T* ptr, size_t size, bool dealloc)
aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc)
  : m_ptr(ptr), m_size(size), m_deallocate(dealloc)
{
  if(NumTraits<T>::RequireInitialization && m_ptr)
@@ -576,7 +576,7 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
}
protected:
  T* m_ptr;
  size_t m_size;
  std::size_t m_size;
  bool m_deallocate;
};
@@ -655,15 +655,15 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)

#if EIGEN_MAX_ALIGN_BYTES!=0
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
  void* operator new(size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \
  void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \
    EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
    EIGEN_CATCH (...) { return 0; } \
  }
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \
  void *operator new(size_t size) { \
  void *operator new(std::size_t size) { \
    return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
  } \
  void *operator new[](size_t size) { \
  void *operator new[](std::size_t size) { \
    return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
  } \
  void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
@@ -673,8 +673,8 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
  /* in-place new and delete. since (at least afaik) there is no actual */ \
  /* memory allocated we can safely let the default implementation handle */ \
  /* this particular case. */ \
  static void *operator new(size_t size, void *ptr) { return ::operator new(size,ptr); } \
  static void *operator new[](size_t size, void* ptr) { return ::operator new[](size,ptr); } \
  static void *operator new(std::size_t size, void *ptr) { return ::operator new(size,ptr); } \
  static void *operator new[](std::size_t size, void* ptr) { return ::operator new[](size,ptr); } \
  void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \
  void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \
  /* nothrow-new (returns zero instead of std::bad_alloc) */ \
@@ -696,7 +696,15 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
/** \class aligned_allocator
  * \ingroup Core_Module
  *
  * \brief STL compatible allocator to use with with 16 byte aligned types
  * \brief STL compatible allocator to use with types requiring a non-standard alignment.
  *
  * The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd.
  * By default, it will thus provide at least 16 bytes alignment and more in the following cases:
  * - 32 bytes alignment if AVX is enabled.
  * - 64 bytes alignment if AVX512 is enabled.
  *
  * This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
  * \link TopicPreprocessorDirectivesPerformance there \endlink.
  *
  * Example:
  * \code
@@ -713,7 +721,7 @@ template<class T>
class aligned_allocator : public std::allocator<T>
{
public:
  typedef size_t size_type;
  typedef std::size_t size_type;
  typedef std::ptrdiff_t difference_type;
  typedef T* pointer;
  typedef const T* const_pointer;
@@ -739,7 +747,15 @@ public:
  pointer allocate(size_type num, const void* /*hint*/ = 0)
  {
    internal::check_size_for_overflow<T>(num);
    return static_cast<pointer>( internal::aligned_malloc(num * sizeof(T)) );
    size_type size = num * sizeof(T);
#if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0)
    // workaround gcc bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544
    // It triggered eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807
    if(size>=std::size_t((std::numeric_limits<std::ptrdiff_t>::max)()))
      return 0;
    else
#endif
    return static_cast<pointer>( internal::aligned_malloc(size) );
  }
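
Typical client-side use of this allocator is with STL containers of fixed-size vectorizable Eigen types, e.g.:

#include <vector>
std::vector<Eigen::Vector4f, Eigen::aligned_allocator<Eigen::Vector4f> > aligned_vec;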

  void deallocate(pointer p, size_type /*num*/)