add unsupported eigen package
This commit is contained in:
parent
a85534cc46
commit
d3d496848d
|
@ -8,8 +8,8 @@ Current Eigen Version 3.3.9 (04.12.2020) updated on 15/06/2021
|
|||
To update the lib:
|
||||
- download Eigen
|
||||
- unzip it somewhere
|
||||
- delete (in the filesystem) the content of the folder eigenlib/Eigen - copy the folders 'Eigen' there
|
||||
- execute the two following shell commands in the folder Eigen
|
||||
- delete (in the filesystem) the content of the folder eigenlib/Eigen - copy the folders 'Eigen' and 'unsupported' there
|
||||
- execute the two following shell commands in the folder 'Eigen' and 'unsupported'
|
||||
|
||||
grep -RiIl 'http://mozilla.org/MPL/2.0/.' * | xargs sed -i 's/http:\/\/mozilla.org\/MPL\/2.0\/./the mozilla.org home page/g'
|
||||
grep -RiIl 'http' * | xargs sed -i 's/http/xxxp/g'
|
||||
|
|
|
@ -0,0 +1,156 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_ADLOC_FORWARD
|
||||
#define EIGEN_ADLOC_FORWARD
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// This file provides support for adolc's adouble type in forward mode.
|
||||
// ADOL-C is a C++ automatic differentiation library,
|
||||
// see xxxps://projects.coin-or.org/ADOL-C for more information.
|
||||
//
|
||||
// Note that the maximal number of directions is controlled by
|
||||
// the preprocessor token NUMBER_DIRECTIONS. The default is 2.
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
|
||||
#define ADOLC_TAPELESS
|
||||
#ifndef NUMBER_DIRECTIONS
|
||||
# define NUMBER_DIRECTIONS 2
|
||||
#endif
|
||||
#include <adolc/adtl.h>
|
||||
|
||||
// adolc defines some very stupid macros:
|
||||
#if defined(malloc)
|
||||
# undef malloc
|
||||
#endif
|
||||
|
||||
#if defined(calloc)
|
||||
# undef calloc
|
||||
#endif
|
||||
|
||||
#if defined(realloc)
|
||||
# undef realloc
|
||||
#endif
|
||||
|
||||
#include <Eigen/Core>
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/**
|
||||
* \defgroup AdolcForward_Module Adolc forward module
|
||||
* This module provides support for adolc's adouble type in forward mode.
|
||||
* ADOL-C is a C++ automatic differentiation library,
|
||||
* see xxxps://projects.coin-or.org/ADOL-C for more information.
|
||||
* It mainly consists in:
|
||||
* - a struct Eigen::NumTraits<adtl::adouble> specialization
|
||||
* - overloads of internal::* math function for adtl::adouble type.
|
||||
*
|
||||
* Note that the maximal number of directions is controlled by
|
||||
* the preprocessor token NUMBER_DIRECTIONS. The default is 2.
|
||||
*
|
||||
* \code
|
||||
* #include <unsupported/Eigen/AdolcSupport>
|
||||
* \endcode
|
||||
*/
|
||||
//@{
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
// Eigen's require a few additional functions which must be defined in the same namespace
|
||||
// than the custom scalar type own namespace
|
||||
namespace adtl {
|
||||
|
||||
inline const adouble& conj(const adouble& x) { return x; }
|
||||
inline const adouble& real(const adouble& x) { return x; }
|
||||
inline adouble imag(const adouble&) { return 0.; }
|
||||
inline adouble abs(const adouble& x) { return fabs(x); }
|
||||
inline adouble abs2(const adouble& x) { return x*x; }
|
||||
|
||||
}
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
template<> struct NumTraits<adtl::adouble>
|
||||
: NumTraits<double>
|
||||
{
|
||||
typedef adtl::adouble Real;
|
||||
typedef adtl::adouble NonInteger;
|
||||
typedef adtl::adouble Nested;
|
||||
enum {
|
||||
IsComplex = 0,
|
||||
IsInteger = 0,
|
||||
IsSigned = 1,
|
||||
RequireInitialization = 1,
|
||||
ReadCost = 1,
|
||||
AddCost = 1,
|
||||
MulCost = 1
|
||||
};
|
||||
};
|
||||
|
||||
template<typename Functor> class AdolcForwardJacobian : public Functor
|
||||
{
|
||||
typedef adtl::adouble ActiveScalar;
|
||||
public:
|
||||
|
||||
AdolcForwardJacobian() : Functor() {}
|
||||
AdolcForwardJacobian(const Functor& f) : Functor(f) {}
|
||||
|
||||
// forward constructors
|
||||
template<typename T0>
|
||||
AdolcForwardJacobian(const T0& a0) : Functor(a0) {}
|
||||
template<typename T0, typename T1>
|
||||
AdolcForwardJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
|
||||
template<typename T0, typename T1, typename T2>
|
||||
AdolcForwardJacobian(const T0& a0, const T1& a1, const T1& a2) : Functor(a0, a1, a2) {}
|
||||
|
||||
typedef typename Functor::InputType InputType;
|
||||
typedef typename Functor::ValueType ValueType;
|
||||
typedef typename Functor::JacobianType JacobianType;
|
||||
|
||||
typedef Matrix<ActiveScalar, InputType::SizeAtCompileTime, 1> ActiveInput;
|
||||
typedef Matrix<ActiveScalar, ValueType::SizeAtCompileTime, 1> ActiveValue;
|
||||
|
||||
void operator() (const InputType& x, ValueType* v, JacobianType* _jac) const
|
||||
{
|
||||
eigen_assert(v!=0);
|
||||
if (!_jac)
|
||||
{
|
||||
Functor::operator()(x, v);
|
||||
return;
|
||||
}
|
||||
|
||||
JacobianType& jac = *_jac;
|
||||
|
||||
ActiveInput ax = x.template cast<ActiveScalar>();
|
||||
ActiveValue av(jac.rows());
|
||||
|
||||
for (int j=0; j<jac.cols(); j++)
|
||||
for (int i=0; i<jac.cols(); i++)
|
||||
ax[i].setADValue(j, i==j ? 1 : 0);
|
||||
|
||||
Functor::operator()(ax, &av);
|
||||
|
||||
for (int i=0; i<jac.rows(); i++)
|
||||
{
|
||||
(*v)[i] = av[i].getValue();
|
||||
for (int j=0; j<jac.cols(); j++)
|
||||
jac.coeffRef(i,j) = av[i].getADValue(j);
|
||||
}
|
||||
}
|
||||
protected:
|
||||
|
||||
};
|
||||
|
||||
//@}
|
||||
|
||||
}
|
||||
|
||||
#endif // EIGEN_ADLOC_FORWARD
|
|
@ -0,0 +1,224 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_ALIGNED_VECTOR3
|
||||
#define EIGEN_ALIGNED_VECTOR3
|
||||
|
||||
#include <Eigen/Geometry>
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/**
|
||||
* \defgroup AlignedVector3_Module Aligned vector3 module
|
||||
*
|
||||
* \code
|
||||
* #include <unsupported/Eigen/AlignedVector3>
|
||||
* \endcode
|
||||
*/
|
||||
//@{
|
||||
|
||||
|
||||
/** \class AlignedVector3
|
||||
*
|
||||
* \brief A vectorization friendly 3D vector
|
||||
*
|
||||
* This class represents a 3D vector internally using a 4D vector
|
||||
* such that vectorization can be seamlessly enabled. Of course,
|
||||
* the same result can be achieved by directly using a 4D vector.
|
||||
* This class makes this process simpler.
|
||||
*
|
||||
*/
|
||||
// TODO specialize Cwise
|
||||
template<typename _Scalar> class AlignedVector3;
|
||||
|
||||
namespace internal {
|
||||
template<typename _Scalar> struct traits<AlignedVector3<_Scalar> >
|
||||
: traits<Matrix<_Scalar,3,1,0,4,1> >
|
||||
{
|
||||
};
|
||||
}
|
||||
|
||||
template<typename _Scalar> class AlignedVector3
|
||||
: public MatrixBase<AlignedVector3<_Scalar> >
|
||||
{
|
||||
typedef Matrix<_Scalar,4,1> CoeffType;
|
||||
CoeffType m_coeffs;
|
||||
public:
|
||||
|
||||
typedef MatrixBase<AlignedVector3<_Scalar> > Base;
|
||||
EIGEN_DENSE_PUBLIC_INTERFACE(AlignedVector3)
|
||||
using Base::operator*;
|
||||
|
||||
inline Index rows() const { return 3; }
|
||||
inline Index cols() const { return 1; }
|
||||
|
||||
Scalar* data() { return m_coeffs.data(); }
|
||||
const Scalar* data() const { return m_coeffs.data(); }
|
||||
Index innerStride() const { return 1; }
|
||||
Index outerStride() const { return 3; }
|
||||
|
||||
inline const Scalar& coeff(Index row, Index col) const
|
||||
{ return m_coeffs.coeff(row, col); }
|
||||
|
||||
inline Scalar& coeffRef(Index row, Index col)
|
||||
{ return m_coeffs.coeffRef(row, col); }
|
||||
|
||||
inline const Scalar& coeff(Index index) const
|
||||
{ return m_coeffs.coeff(index); }
|
||||
|
||||
inline Scalar& coeffRef(Index index)
|
||||
{ return m_coeffs.coeffRef(index);}
|
||||
|
||||
|
||||
inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z)
|
||||
: m_coeffs(x, y, z, Scalar(0))
|
||||
{}
|
||||
|
||||
inline AlignedVector3(const AlignedVector3& other)
|
||||
: Base(), m_coeffs(other.m_coeffs)
|
||||
{}
|
||||
|
||||
template<typename XprType, int Size=XprType::SizeAtCompileTime>
|
||||
struct generic_assign_selector {};
|
||||
|
||||
template<typename XprType> struct generic_assign_selector<XprType,4>
|
||||
{
|
||||
inline static void run(AlignedVector3& dest, const XprType& src)
|
||||
{
|
||||
dest.m_coeffs = src;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename XprType> struct generic_assign_selector<XprType,3>
|
||||
{
|
||||
inline static void run(AlignedVector3& dest, const XprType& src)
|
||||
{
|
||||
dest.m_coeffs.template head<3>() = src;
|
||||
dest.m_coeffs.w() = Scalar(0);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Derived>
|
||||
inline AlignedVector3(const MatrixBase<Derived>& other)
|
||||
{
|
||||
generic_assign_selector<Derived>::run(*this,other.derived());
|
||||
}
|
||||
|
||||
inline AlignedVector3& operator=(const AlignedVector3& other)
|
||||
{ m_coeffs = other.m_coeffs; return *this; }
|
||||
|
||||
template <typename Derived>
|
||||
inline AlignedVector3& operator=(const MatrixBase<Derived>& other)
|
||||
{
|
||||
generic_assign_selector<Derived>::run(*this,other.derived());
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline AlignedVector3 operator+(const AlignedVector3& other) const
|
||||
{ return AlignedVector3(m_coeffs + other.m_coeffs); }
|
||||
|
||||
inline AlignedVector3& operator+=(const AlignedVector3& other)
|
||||
{ m_coeffs += other.m_coeffs; return *this; }
|
||||
|
||||
inline AlignedVector3 operator-(const AlignedVector3& other) const
|
||||
{ return AlignedVector3(m_coeffs - other.m_coeffs); }
|
||||
|
||||
inline AlignedVector3 operator-=(const AlignedVector3& other)
|
||||
{ m_coeffs -= other.m_coeffs; return *this; }
|
||||
|
||||
inline AlignedVector3 operator*(const Scalar& s) const
|
||||
{ return AlignedVector3(m_coeffs * s); }
|
||||
|
||||
inline friend AlignedVector3 operator*(const Scalar& s,const AlignedVector3& vec)
|
||||
{ return AlignedVector3(s * vec.m_coeffs); }
|
||||
|
||||
inline AlignedVector3& operator*=(const Scalar& s)
|
||||
{ m_coeffs *= s; return *this; }
|
||||
|
||||
inline AlignedVector3 operator/(const Scalar& s) const
|
||||
{ return AlignedVector3(m_coeffs / s); }
|
||||
|
||||
inline AlignedVector3& operator/=(const Scalar& s)
|
||||
{ m_coeffs /= s; return *this; }
|
||||
|
||||
inline Scalar dot(const AlignedVector3& other) const
|
||||
{
|
||||
eigen_assert(m_coeffs.w()==Scalar(0));
|
||||
eigen_assert(other.m_coeffs.w()==Scalar(0));
|
||||
return m_coeffs.dot(other.m_coeffs);
|
||||
}
|
||||
|
||||
inline void normalize()
|
||||
{
|
||||
m_coeffs /= norm();
|
||||
}
|
||||
|
||||
inline AlignedVector3 normalized() const
|
||||
{
|
||||
return AlignedVector3(m_coeffs / norm());
|
||||
}
|
||||
|
||||
inline Scalar sum() const
|
||||
{
|
||||
eigen_assert(m_coeffs.w()==Scalar(0));
|
||||
return m_coeffs.sum();
|
||||
}
|
||||
|
||||
inline Scalar squaredNorm() const
|
||||
{
|
||||
eigen_assert(m_coeffs.w()==Scalar(0));
|
||||
return m_coeffs.squaredNorm();
|
||||
}
|
||||
|
||||
inline Scalar norm() const
|
||||
{
|
||||
using std::sqrt;
|
||||
return sqrt(squaredNorm());
|
||||
}
|
||||
|
||||
inline AlignedVector3 cross(const AlignedVector3& other) const
|
||||
{
|
||||
return AlignedVector3(m_coeffs.cross3(other.m_coeffs));
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
inline bool isApprox(const MatrixBase<Derived>& other, const RealScalar& eps=NumTraits<Scalar>::dummy_precision()) const
|
||||
{
|
||||
return m_coeffs.template head<3>().isApprox(other,eps);
|
||||
}
|
||||
|
||||
CoeffType& coeffs() { return m_coeffs; }
|
||||
const CoeffType& coeffs() const { return m_coeffs; }
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename _Scalar>
|
||||
struct eval<AlignedVector3<_Scalar>, Dense>
|
||||
{
|
||||
typedef const AlignedVector3<_Scalar>& type;
|
||||
};
|
||||
|
||||
template<typename Scalar>
|
||||
struct evaluator<AlignedVector3<Scalar> >
|
||||
: evaluator<Matrix<Scalar,4,1> >
|
||||
{
|
||||
typedef AlignedVector3<Scalar> XprType;
|
||||
typedef evaluator<Matrix<Scalar,4,1> > Base;
|
||||
|
||||
evaluator(const XprType &m) : Base(m.coeffs()) {}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
//@}
|
||||
|
||||
}
|
||||
|
||||
#endif // EIGEN_ALIGNED_VECTOR3
|
|
@ -0,0 +1,31 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_ARPACKSUPPORT_MODULE_H
|
||||
#define EIGEN_ARPACKSUPPORT_MODULE_H
|
||||
|
||||
#include <Eigen/Core>
|
||||
|
||||
/** \defgroup ArpackSupport_Module Arpack support module
|
||||
*
|
||||
* This module provides a wrapper to Arpack, a library for sparse eigenvalue decomposition.
|
||||
*
|
||||
* \code
|
||||
* #include <Eigen/ArpackSupport>
|
||||
* \endcode
|
||||
*/
|
||||
|
||||
#include <Eigen/SparseCholesky>
|
||||
|
||||
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
|
||||
#include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h"
|
||||
|
||||
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
|
||||
|
||||
#endif // EIGEN_ARPACKSUPPORT_MODULE_H
|
||||
/* vim: set filetype=cpp et sw=2 ts=2 ai: */
|
|
@ -0,0 +1,40 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_AUTODIFF_MODULE
|
||||
#define EIGEN_AUTODIFF_MODULE
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/**
|
||||
* \defgroup AutoDiff_Module Auto Diff module
|
||||
*
|
||||
* This module features forward automatic differentation via a simple
|
||||
* templated scalar type wrapper AutoDiffScalar.
|
||||
*
|
||||
* Warning : this should NOT be confused with numerical differentiation, which
|
||||
* is a different method and has its own module in Eigen : \ref NumericalDiff_Module.
|
||||
*
|
||||
* \code
|
||||
* #include <unsupported/Eigen/AutoDiff>
|
||||
* \endcode
|
||||
*/
|
||||
//@{
|
||||
|
||||
}
|
||||
|
||||
#include "src/AutoDiff/AutoDiffScalar.h"
|
||||
// #include "src/AutoDiff/AutoDiffVector.h"
|
||||
#include "src/AutoDiff/AutoDiffJacobian.h"
|
||||
|
||||
namespace Eigen {
|
||||
//@}
|
||||
}
|
||||
|
||||
#endif // EIGEN_AUTODIFF_MODULE
|
|
@ -0,0 +1,95 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2009 Ilya Baran <ibaran@mit.edu>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_BVH_MODULE_H
|
||||
#define EIGEN_BVH_MODULE_H
|
||||
|
||||
#include <Eigen/Core>
|
||||
#include <Eigen/Geometry>
|
||||
#include <Eigen/StdVector>
|
||||
#include <algorithm>
|
||||
#include <queue>
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/**
|
||||
* \defgroup BVH_Module BVH module
|
||||
* \brief This module provides generic bounding volume hierarchy algorithms
|
||||
* and reference tree implementations.
|
||||
*
|
||||
*
|
||||
* \code
|
||||
* #include <unsupported/Eigen/BVH>
|
||||
* \endcode
|
||||
*
|
||||
* A bounding volume hierarchy (BVH) can accelerate many geometric queries. This module provides a generic implementation
|
||||
* of the two basic algorithms over a BVH: intersection of a query object against all objects in the hierarchy and minimization
|
||||
* of a function over the objects in the hierarchy. It also provides intersection and minimization over a cartesian product of
|
||||
* two BVH's. A BVH accelerates intersection by using the fact that if a query object does not intersect a volume, then it cannot
|
||||
* intersect any object contained in that volume. Similarly, a BVH accelerates minimization because the minimum of a function
|
||||
* over a volume is no greater than the minimum of a function over any object contained in it.
|
||||
*
|
||||
* Some sample queries that can be written in terms of intersection are:
|
||||
* - Determine all points where a ray intersects a triangle mesh
|
||||
* - Given a set of points, determine which are contained in a query sphere
|
||||
* - Given a set of spheres, determine which contain the query point
|
||||
* - Given a set of disks, determine if any is completely contained in a query rectangle (represent each 2D disk as a point \f$(x,y,r)\f$
|
||||
* in 3D and represent the rectangle as a pyramid based on the original rectangle and shrinking in the \f$r\f$ direction)
|
||||
* - Given a set of points, count how many pairs are \f$d\pm\epsilon\f$ apart (done by looking at the cartesian product of the set
|
||||
* of points with itself)
|
||||
*
|
||||
* Some sample queries that can be written in terms of function minimization over a set of objects are:
|
||||
* - Find the intersection between a ray and a triangle mesh closest to the ray origin (function is infinite off the ray)
|
||||
* - Given a polyline and a query point, determine the closest point on the polyline to the query
|
||||
* - Find the diameter of a point cloud (done by looking at the cartesian product and using negative distance as the function)
|
||||
* - Determine how far two meshes are from colliding (this is also a cartesian product query)
|
||||
*
|
||||
* This implementation decouples the basic algorithms both from the type of hierarchy (and the types of the bounding volumes) and
|
||||
* from the particulars of the query. To enable abstraction from the BVH, the BVH is required to implement a generic mechanism
|
||||
* for traversal. To abstract from the query, the query is responsible for keeping track of results.
|
||||
*
|
||||
* To be used in the algorithms, a hierarchy must implement the following traversal mechanism (see KdBVH for a sample implementation): \code
|
||||
typedef Volume //the type of bounding volume
|
||||
typedef Object //the type of object in the hierarchy
|
||||
typedef Index //a reference to a node in the hierarchy--typically an int or a pointer
|
||||
typedef VolumeIterator //an iterator type over node children--returns Index
|
||||
typedef ObjectIterator //an iterator over object (leaf) children--returns const Object &
|
||||
Index getRootIndex() const //returns the index of the hierarchy root
|
||||
const Volume &getVolume(Index index) const //returns the bounding volume of the node at given index
|
||||
void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd,
|
||||
ObjectIterator &outOBegin, ObjectIterator &outOEnd) const
|
||||
//getChildren takes a node index and makes [outVBegin, outVEnd) range over its node children
|
||||
//and [outOBegin, outOEnd) range over its object children
|
||||
\endcode
|
||||
*
|
||||
* To use the hierarchy, call BVIntersect or BVMinimize, passing it a BVH (or two, for cartesian product) and a minimizer or intersector.
|
||||
* For an intersection query on a single BVH, the intersector encapsulates the query and must provide two functions:
|
||||
* \code
|
||||
bool intersectVolume(const Volume &volume) //returns true if the query intersects the volume
|
||||
bool intersectObject(const Object &object) //returns true if the intersection search should terminate immediately
|
||||
\endcode
|
||||
* The guarantee that BVIntersect provides is that intersectObject will be called on every object whose bounding volume
|
||||
* intersects the query (but possibly on other objects too) unless the search is terminated prematurely. It is the
|
||||
* responsibility of the intersectObject function to keep track of the results in whatever manner is appropriate.
|
||||
* The cartesian product intersection and the BVMinimize queries are similar--see their individual documentation.
|
||||
*
|
||||
* The following is a simple but complete example for how to use the BVH to accelerate the search for a closest red-blue point pair:
|
||||
* \include BVH_Example.cpp
|
||||
* Output: \verbinclude BVH_Example.out
|
||||
*/
|
||||
}
|
||||
|
||||
//@{
|
||||
|
||||
#include "src/BVH/BVAlgorithms.h"
|
||||
#include "src/BVH/KdBVH.h"
|
||||
|
||||
//@}
|
||||
|
||||
#endif // EIGEN_BVH_MODULE_H
|
|
@ -0,0 +1,32 @@
|
|||
set(Eigen_HEADERS
|
||||
AdolcForward
|
||||
AlignedVector3
|
||||
ArpackSupport
|
||||
AutoDiff
|
||||
BVH
|
||||
EulerAngles
|
||||
FFT
|
||||
IterativeSolvers
|
||||
KroneckerProduct
|
||||
LevenbergMarquardt
|
||||
MatrixFunctions
|
||||
MoreVectorization
|
||||
MPRealSupport
|
||||
NonLinearOptimization
|
||||
NumericalDiff
|
||||
OpenGLSupport
|
||||
Polynomials
|
||||
Skyline
|
||||
SparseExtra
|
||||
SpecialFunctions
|
||||
Splines
|
||||
)
|
||||
|
||||
install(FILES
|
||||
${Eigen_HEADERS}
|
||||
DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
|
||||
)
|
||||
|
||||
install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
|
||||
|
||||
add_subdirectory(CXX11)
|
|
@ -0,0 +1,8 @@
|
|||
set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool)
|
||||
|
||||
install(FILES
|
||||
${Eigen_CXX11_HEADERS}
|
||||
DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
|
||||
)
|
||||
|
||||
install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h")
|
|
@ -0,0 +1,154 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
//#ifndef EIGEN_CXX11_TENSOR_MODULE
|
||||
//#define EIGEN_CXX11_TENSOR_MODULE
|
||||
|
||||
#include "../../../Eigen/Core"
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
#undef min
|
||||
#undef max
|
||||
#undef isnan
|
||||
#undef isinf
|
||||
#undef isfinite
|
||||
#include <SYCL/sycl.hpp>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#endif
|
||||
|
||||
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
|
||||
|
||||
#include "../SpecialFunctions"
|
||||
#include "src/util/CXX11Meta.h"
|
||||
#include "src/util/MaxSizeVector.h"
|
||||
|
||||
/** \defgroup CXX11_Tensor_Module Tensor Module
|
||||
*
|
||||
* This module provides a Tensor class for storing arbitrarily indexed
|
||||
* objects.
|
||||
*
|
||||
* \code
|
||||
* #include <Eigen/CXX11/Tensor>
|
||||
* \endcode
|
||||
*
|
||||
* Much of the documentation can be found \ref eigen_tensors "here".
|
||||
*/
|
||||
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
#include <cstring>
|
||||
|
||||
#ifdef _WIN32
|
||||
typedef __int16 int16_t;
|
||||
typedef unsigned __int16 uint16_t;
|
||||
typedef __int32 int32_t;
|
||||
typedef unsigned __int32 uint32_t;
|
||||
typedef __int64 int64_t;
|
||||
typedef unsigned __int64 uint64_t;
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
|
||||
#include <random>
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#elif defined(__APPLE__)
|
||||
#include <mach/mach_time.h>
|
||||
#else
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_USE_THREADS
|
||||
#include "ThreadPool"
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_USE_GPU
|
||||
#include <iostream>
|
||||
#include <cuda_runtime.h>
|
||||
#if __cplusplus >= 201103L
|
||||
#include <atomic>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "src/Tensor/TensorMacros.h"
|
||||
#include "src/Tensor/TensorForwardDeclarations.h"
|
||||
#include "src/Tensor/TensorMeta.h"
|
||||
#include "src/Tensor/TensorFunctors.h"
|
||||
#include "src/Tensor/TensorCostModel.h"
|
||||
#include "src/Tensor/TensorDeviceDefault.h"
|
||||
#include "src/Tensor/TensorDeviceThreadPool.h"
|
||||
#include "src/Tensor/TensorDeviceCuda.h"
|
||||
#include "src/Tensor/TensorDeviceSycl.h"
|
||||
#include "src/Tensor/TensorIndexList.h"
|
||||
#include "src/Tensor/TensorDimensionList.h"
|
||||
#include "src/Tensor/TensorDimensions.h"
|
||||
#include "src/Tensor/TensorInitializer.h"
|
||||
#include "src/Tensor/TensorTraits.h"
|
||||
#include "src/Tensor/TensorRandom.h"
|
||||
#include "src/Tensor/TensorUInt128.h"
|
||||
#include "src/Tensor/TensorIntDiv.h"
|
||||
#include "src/Tensor/TensorGlobalFunctions.h"
|
||||
|
||||
#include "src/Tensor/TensorBase.h"
|
||||
|
||||
#include "src/Tensor/TensorEvaluator.h"
|
||||
#include "src/Tensor/TensorExpr.h"
|
||||
#include "src/Tensor/TensorReduction.h"
|
||||
#include "src/Tensor/TensorReductionCuda.h"
|
||||
#include "src/Tensor/TensorArgMax.h"
|
||||
#include "src/Tensor/TensorConcatenation.h"
|
||||
#include "src/Tensor/TensorContractionMapper.h"
|
||||
#include "src/Tensor/TensorContractionBlocking.h"
|
||||
#include "src/Tensor/TensorContraction.h"
|
||||
#include "src/Tensor/TensorContractionThreadPool.h"
|
||||
#include "src/Tensor/TensorContractionCuda.h"
|
||||
#include "src/Tensor/TensorConversion.h"
|
||||
#include "src/Tensor/TensorConvolution.h"
|
||||
#include "src/Tensor/TensorFFT.h"
|
||||
#include "src/Tensor/TensorPatch.h"
|
||||
#include "src/Tensor/TensorImagePatch.h"
|
||||
#include "src/Tensor/TensorVolumePatch.h"
|
||||
#include "src/Tensor/TensorBroadcasting.h"
|
||||
#include "src/Tensor/TensorChipping.h"
|
||||
#include "src/Tensor/TensorInflation.h"
|
||||
#include "src/Tensor/TensorLayoutSwap.h"
|
||||
#include "src/Tensor/TensorMorphing.h"
|
||||
#include "src/Tensor/TensorPadding.h"
|
||||
#include "src/Tensor/TensorReverse.h"
|
||||
#include "src/Tensor/TensorShuffling.h"
|
||||
#include "src/Tensor/TensorStriding.h"
|
||||
#include "src/Tensor/TensorCustomOp.h"
|
||||
#include "src/Tensor/TensorEvalTo.h"
|
||||
#include "src/Tensor/TensorForcedEval.h"
|
||||
#include "src/Tensor/TensorGenerator.h"
|
||||
#include "src/Tensor/TensorAssign.h"
|
||||
#include "src/Tensor/TensorScan.h"
|
||||
|
||||
#include "src/Tensor/TensorSycl.h"
|
||||
#include "src/Tensor/TensorExecutor.h"
|
||||
#include "src/Tensor/TensorDevice.h"
|
||||
|
||||
#include "src/Tensor/TensorStorage.h"
|
||||
#include "src/Tensor/Tensor.h"
|
||||
#include "src/Tensor/TensorFixedSize.h"
|
||||
#include "src/Tensor/TensorMap.h"
|
||||
#include "src/Tensor/TensorRef.h"
|
||||
|
||||
#include "src/Tensor/TensorIO.h"
|
||||
|
||||
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
|
||||
|
||||
//#endif // EIGEN_CXX11_TENSOR_MODULE
|
|
@ -0,0 +1,42 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE
|
||||
#define EIGEN_CXX11_TENSORSYMMETRY_MODULE
|
||||
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
|
||||
|
||||
#include "src/util/CXX11Meta.h"
|
||||
|
||||
/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module
|
||||
*
|
||||
* This module provides a classes that allow for the definition of
|
||||
* symmetries w.r.t. tensor indices.
|
||||
*
|
||||
* Including this module will implicitly include the Tensor module.
|
||||
*
|
||||
* \code
|
||||
* #include <Eigen/TensorSymmetry>
|
||||
* \endcode
|
||||
*/
|
||||
|
||||
#include "src/TensorSymmetry/util/TemplateGroupTheory.h"
|
||||
#include "src/TensorSymmetry/Symmetry.h"
|
||||
#include "src/TensorSymmetry/StaticSymmetry.h"
|
||||
#include "src/TensorSymmetry/DynamicSymmetry.h"
|
||||
|
||||
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
|
||||
|
||||
#endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE
|
||||
|
||||
/*
|
||||
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
|
||||
*/
|
|
@ -0,0 +1,65 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_MODULE
|
||||
#define EIGEN_CXX11_THREADPOOL_MODULE
|
||||
|
||||
#include "../../../Eigen/Core"
|
||||
|
||||
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
|
||||
|
||||
/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module
|
||||
*
|
||||
* This module provides 2 threadpool implementations
|
||||
* - a simple reference implementation
|
||||
* - a faster non blocking implementation
|
||||
*
|
||||
* This module requires C++11.
|
||||
*
|
||||
* \code
|
||||
* #include <Eigen/CXX11/ThreadPool>
|
||||
* \endcode
|
||||
*/
|
||||
|
||||
|
||||
// The code depends on CXX11, so only include the module if the
|
||||
// compiler supports it.
|
||||
#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
|
||||
#include <cstddef>
|
||||
#include <cstring>
|
||||
#include <stdint.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <vector>
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <deque>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
|
||||
#include "src/util/CXX11Meta.h"
|
||||
#include "src/util/MaxSizeVector.h"
|
||||
|
||||
#include "src/ThreadPool/ThreadLocal.h"
|
||||
#include "src/ThreadPool/ThreadYield.h"
|
||||
#include "src/ThreadPool/EventCount.h"
|
||||
#include "src/ThreadPool/RunQueue.h"
|
||||
#include "src/ThreadPool/ThreadPoolInterface.h"
|
||||
#include "src/ThreadPool/ThreadEnvironment.h"
|
||||
#include "src/ThreadPool/SimpleThreadPool.h"
|
||||
#include "src/ThreadPool/NonBlockingThreadPool.h"
|
||||
|
||||
#endif
|
||||
|
||||
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_MODULE
|
||||
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,527 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class Tensor
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief The tensor class.
|
||||
*
|
||||
* The %Tensor class is the work-horse for all \em dense tensors within Eigen.
|
||||
*
|
||||
* The %Tensor class encompasses only dynamic-size objects so far.
|
||||
*
|
||||
* The first two template parameters are required:
|
||||
* \tparam Scalar_ Numeric type, e.g. float, double, int or `std::complex<float>`.
|
||||
* User defined scalar types are supported as well (see \ref user_defined_scalars "here").
|
||||
* \tparam NumIndices_ Number of indices (i.e. rank of the tensor)
|
||||
*
|
||||
* The remaining template parameters are optional -- in most cases you don't have to worry about them.
|
||||
* \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either
|
||||
* \b #AutoAlign or \b #DontAlign.
|
||||
* The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required
|
||||
* for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization.
|
||||
* Support for such operations (i.e. adding two tensors etc.) is planned.
|
||||
*
|
||||
* You can access elements of tensors using normal subscripting:
|
||||
*
|
||||
* \code
|
||||
* Eigen::Tensor<double, 4> t(10, 10, 10, 10);
|
||||
* t(0, 1, 2, 3) = 42.0;
|
||||
* \endcode
|
||||
*
|
||||
* This class can be extended with the help of the plugin mechanism described on the page
|
||||
* \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN.
|
||||
*
|
||||
* <i><b>Some notes:</b></i>
|
||||
*
|
||||
* <dl>
|
||||
* <dt><b>Relation to other parts of Eigen:</b></dt>
|
||||
* <dd>The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
|
||||
* taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code
|
||||
* by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor
|
||||
* class does not provide any of these features and is only available as a stand-alone class that just allows for
|
||||
* coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to
|
||||
* change dramatically.</dd>
|
||||
* </dl>
|
||||
*
|
||||
* \ref TopicStorageOrders
|
||||
*/
|
||||
|
||||
template<typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
|
||||
class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
|
||||
{
|
||||
public:
|
||||
typedef Tensor<Scalar_, NumIndices_, Options_, IndexType_> Self;
|
||||
typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > Base;
|
||||
typedef typename Eigen::internal::nested<Self>::type Nested;
|
||||
typedef typename internal::traits<Self>::StorageKind StorageKind;
|
||||
typedef typename internal::traits<Self>::Index Index;
|
||||
typedef Scalar_ Scalar;
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename Base::CoeffReturnType CoeffReturnType;
|
||||
|
||||
enum {
|
||||
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign),
|
||||
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
|
||||
CoordAccess = true,
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
static const int Options = Options_;
|
||||
static const int NumIndices = NumIndices_;
|
||||
typedef DSizes<Index, NumIndices_> Dimensions;
|
||||
|
||||
protected:
|
||||
TensorStorage<Scalar, Dimensions, Options> m_storage;
|
||||
|
||||
#ifdef EIGEN_HAS_SFINAE
|
||||
template<typename CustomIndices>
|
||||
struct isOfNormalIndex{
|
||||
static const bool is_array = internal::is_base_of<array<Index, NumIndices>, CustomIndices>::value;
|
||||
static const bool is_int = NumTraits<CustomIndices>::IsInteger;
|
||||
static const bool value = is_array | is_int;
|
||||
};
|
||||
#endif
|
||||
|
||||
public:
|
||||
// Metadata
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
|
||||
|
||||
// This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
|
||||
// work, because that uses base().coeffRef() - and we don't yet
|
||||
// implement a similar class hierarchy
|
||||
inline Self& base() { return *this; }
|
||||
inline const Self& base() const { return *this; }
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes>
|
||||
EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
|
||||
{
|
||||
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
|
||||
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
|
||||
}
|
||||
#endif
|
||||
|
||||
// normal indices
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
|
||||
{
|
||||
eigen_internal_assert(checkIndexRange(indices));
|
||||
return m_storage.data()[linearizedIndex(indices)];
|
||||
}
|
||||
|
||||
// custom indices
|
||||
#ifdef EIGEN_HAS_SFINAE
|
||||
template<typename CustomIndices,
|
||||
EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
|
||||
>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const
|
||||
{
|
||||
return coeff(internal::customIndices2Array<Index,NumIndices>(indices));
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return m_storage.data()[0];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
|
||||
{
|
||||
eigen_internal_assert(index >= 0 && index < size());
|
||||
return m_storage.data()[index];
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes>
|
||||
inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
|
||||
{
|
||||
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
|
||||
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
|
||||
}
|
||||
#endif
|
||||
|
||||
// normal indices
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
|
||||
{
|
||||
eigen_internal_assert(checkIndexRange(indices));
|
||||
return m_storage.data()[linearizedIndex(indices)];
|
||||
}
|
||||
|
||||
// custom indices
|
||||
#ifdef EIGEN_HAS_SFINAE
|
||||
template<typename CustomIndices,
|
||||
EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
|
||||
>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices)
|
||||
{
|
||||
return coeffRef(internal::customIndices2Array<Index,NumIndices>(indices));
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef()
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return m_storage.data()[0];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
|
||||
{
|
||||
eigen_internal_assert(index >= 0 && index < size());
|
||||
return m_storage.data()[index];
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes>
|
||||
inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
|
||||
{
|
||||
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
|
||||
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
|
||||
}
|
||||
#else
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
|
||||
{
|
||||
return coeff(array<Index, 2>(i0, i1));
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
|
||||
{
|
||||
return coeff(array<Index, 3>(i0, i1, i2));
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
|
||||
{
|
||||
return coeff(array<Index, 4>(i0, i1, i2, i3));
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
|
||||
{
|
||||
return coeff(array<Index, 5>(i0, i1, i2, i3, i4));
|
||||
}
|
||||
#endif
|
||||
|
||||
// custom indices
|
||||
#ifdef EIGEN_HAS_SFINAE
|
||||
template<typename CustomIndices,
|
||||
EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
|
||||
>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const
|
||||
{
|
||||
return coeff(internal::customIndices2Array<Index,NumIndices>(indices));
|
||||
}
|
||||
#endif
|
||||
|
||||
// normal indices
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
|
||||
{
|
||||
return coeff(indices);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
|
||||
{
|
||||
eigen_internal_assert(index >= 0 && index < size());
|
||||
return coeff(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return coeff();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
|
||||
{
|
||||
// The bracket operator is only for vectors, use the parenthesis operator instead.
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return coeff(index);
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes>
|
||||
inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
|
||||
{
|
||||
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
|
||||
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
|
||||
}
|
||||
#else
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
|
||||
{
|
||||
return coeffRef(array<Index, 2>(i0, i1));
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
|
||||
{
|
||||
return coeffRef(array<Index, 3>(i0, i1, i2));
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
|
||||
{
|
||||
return coeffRef(array<Index, 4>(i0, i1, i2, i3));
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
|
||||
{
|
||||
return coeffRef(array<Index, 5>(i0, i1, i2, i3, i4));
|
||||
}
|
||||
#endif
|
||||
|
||||
// normal indices
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
|
||||
{
|
||||
return coeffRef(indices);
|
||||
}
|
||||
|
||||
// custom indices
|
||||
#ifdef EIGEN_HAS_SFINAE
|
||||
template<typename CustomIndices,
|
||||
EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
|
||||
>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices)
|
||||
{
|
||||
return coeffRef(internal::customIndices2Array<Index,NumIndices>(indices));
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index)
|
||||
{
|
||||
eigen_assert(index >= 0 && index < size());
|
||||
return coeffRef(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()()
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return coeffRef();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index)
|
||||
{
|
||||
// The bracket operator is only for vectors, use the parenthesis operator instead
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
return coeffRef(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Tensor()
|
||||
: m_storage()
|
||||
{
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Tensor(const Self& other)
|
||||
: m_storage(other.m_storage)
|
||||
{
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
|
||||
: m_storage(firstDimension, otherDimensions...)
|
||||
{
|
||||
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
|
||||
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
}
|
||||
#else
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1)
|
||||
: m_storage(dim1, array<Index, 1>(dim1))
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2)
|
||||
: m_storage(dim1*dim2, array<Index, 2>(dim1, dim2))
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3)
|
||||
: m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3))
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
|
||||
: m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4))
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
|
||||
: m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5))
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
}
|
||||
#endif
|
||||
|
||||
/** Normal Dimension */
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array<Index, NumIndices>& dimensions)
|
||||
: m_storage(internal::array_prod(dimensions), dimensions)
|
||||
{
|
||||
EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other)
|
||||
{
|
||||
typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
|
||||
Assign assign(*this, other.derived());
|
||||
resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
}
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other)
|
||||
{
|
||||
typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
|
||||
Assign assign(*this, other.derived());
|
||||
resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other)
|
||||
{
|
||||
typedef TensorAssignOp<Tensor, const Tensor> Assign;
|
||||
Assign assign(*this, other);
|
||||
resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other)
|
||||
{
|
||||
typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
|
||||
Assign assign(*this, other);
|
||||
resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
|
||||
void resize(Index firstDimension, IndexTypes... otherDimensions)
|
||||
{
|
||||
// The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
|
||||
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}});
|
||||
}
|
||||
#endif
|
||||
|
||||
/** Normal Dimension */
|
||||
EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions)
|
||||
{
|
||||
int i;
|
||||
Index size = Index(1);
|
||||
for (i = 0; i < NumIndices; i++) {
|
||||
internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]);
|
||||
size *= dimensions[i];
|
||||
}
|
||||
#ifdef EIGEN_INITIALIZE_COEFFS
|
||||
bool size_changed = size != this->size();
|
||||
m_storage.resize(size, dimensions);
|
||||
if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
|
||||
#else
|
||||
m_storage.resize(size, dimensions);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Why this overload, DSizes is derived from array ??? //
|
||||
EIGEN_DEVICE_FUNC void resize(const DSizes<Index, NumIndices>& dimensions) {
|
||||
array<Index, NumIndices> dims;
|
||||
for (int i = 0; i < NumIndices; ++i) {
|
||||
dims[i] = dimensions[i];
|
||||
}
|
||||
resize(dims);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
void resize()
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
// Nothing to do: rank 0 tensors have fixed size
|
||||
}
|
||||
|
||||
/** Custom Dimension */
|
||||
#ifdef EIGEN_HAS_SFINAE
|
||||
template<typename CustomDimension,
|
||||
EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomDimension>::value) )
|
||||
>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions)
|
||||
{
|
||||
resize(internal::customIndices2Array<Index,NumIndices>(dimensions));
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_EMULATE_CXX11_META_H
|
||||
template <typename std::ptrdiff_t... Indices>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void resize(const Sizes<Indices...>& dimensions) {
|
||||
array<Index, NumIndices> dims;
|
||||
for (int i = 0; i < NumIndices; ++i) {
|
||||
dims[i] = static_cast<Index>(dimensions[i]);
|
||||
}
|
||||
resize(dims);
|
||||
}
|
||||
#else
|
||||
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void resize(const Sizes<V1, V2, V3, V4, V5>& dimensions) {
|
||||
array<Index, NumIndices> dims;
|
||||
for (int i = 0; i < NumIndices; ++i) {
|
||||
dims[i] = static_cast<Index>(dimensions[i]);
|
||||
}
|
||||
resize(dims);
|
||||
}
|
||||
#endif
|
||||
|
||||
protected:
|
||||
|
||||
bool checkIndexRange(const array<Index, NumIndices>& indices) const
|
||||
{
|
||||
using internal::array_apply_and_reduce;
|
||||
using internal::array_zip_and_reduce;
|
||||
using internal::greater_equal_zero_op;
|
||||
using internal::logical_and_op;
|
||||
using internal::lesser_op;
|
||||
|
||||
return
|
||||
// check whether the indices are all >= 0
|
||||
array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
|
||||
// check whether the indices fit in the dimensions
|
||||
array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const
|
||||
{
|
||||
if (Options&RowMajor) {
|
||||
return m_storage.dimensions().IndexOfRowMajor(indices);
|
||||
} else {
|
||||
return m_storage.dimensions().IndexOfColMajor(indices);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_H
|
|
@ -0,0 +1,299 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
|
||||
// Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
/** \class TensorIndexTuple
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor + Index Tuple class.
|
||||
*
|
||||
*
|
||||
*/
|
||||
template<typename XprType>
|
||||
struct traits<TensorIndexTupleOp<XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef Tuple<Index, typename XprTraits::Scalar> Scalar;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename XprType>
|
||||
struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorIndexTupleOp<XprType>& type;
|
||||
};
|
||||
|
||||
template<typename XprType>
|
||||
struct nested<TensorIndexTupleOp<XprType>, 1,
|
||||
typename eval<TensorIndexTupleOp<XprType> >::type>
|
||||
{
|
||||
typedef TensorIndexTupleOp<XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
template<typename XprType>
|
||||
class TensorIndexTupleOp : public TensorBase<TensorIndexTupleOp<XprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename Eigen::internal::nested<TensorIndexTupleOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorIndexTupleOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Index Index;
|
||||
typedef Tuple<Index, typename XprType::CoeffReturnType> CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexTupleOp(const XprType& expr)
|
||||
: m_xpr(expr) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
};
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
|
||||
{
|
||||
typedef TensorIndexTupleOp<ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
|
||||
static const int NumDims = internal::array_size<Dimensions>::value;
|
||||
|
||||
enum {
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
|
||||
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device) { }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
|
||||
return m_impl.dimensions();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return CoeffReturnType(index, m_impl.coeff(index));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
protected:
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
/** \class TensorTupleIndex
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Converts to Tensor<Tuple<Index, Scalar> > and reduces to Tensor<Index>.
|
||||
*
|
||||
*/
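// Illustrative sketch of how this reduction is typically consumed (assuming the
// TensorBase::argmax()/argmin() helpers declared elsewhere in this module, which build a
// TensorTupleReducerOp on top of index_tuples() and reduce()):
//
// \code
// Eigen::Tensor<float, 3> t(2, 3, 4);
// t.setRandom();
// // Index of the maximum along dimension 1, for every (i, k) pair.
// Eigen::Tensor<Eigen::DenseIndex, 2> am = t.argmax(1);
// // Linear index of the global maximum.
// Eigen::Tensor<Eigen::DenseIndex, 0> global = t.argmax();
// \endcode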
|
||||
template<typename ReduceOp, typename Dims, typename XprType>
|
||||
struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef Index Scalar;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename ReduceOp, typename Dims, typename XprType>
|
||||
struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename ReduceOp, typename Dims, typename XprType>
|
||||
struct nested<TensorTupleReducerOp<ReduceOp, Dims, XprType>, 1,
|
||||
typename eval<TensorTupleReducerOp<ReduceOp, Dims, XprType> >::type>
|
||||
{
|
||||
typedef TensorTupleReducerOp<ReduceOp, Dims, XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
template<typename ReduceOp, typename Dims, typename XprType>
|
||||
class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Dims, XprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename Eigen::internal::nested<TensorTupleReducerOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorTupleReducerOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Index Index;
|
||||
typedef Index CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr,
|
||||
const ReduceOp& reduce_op,
|
||||
const int return_dim,
|
||||
const Dims& reduce_dims)
|
||||
: m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const ReduceOp& reduce_op() const { return m_reduce_op; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const Dims& reduce_dims() const { return m_reduce_dims; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
int return_dim() const { return m_return_dim; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const ReduceOp m_reduce_op;
|
||||
const int m_return_dim;
|
||||
const Dims m_reduce_dims;
|
||||
};
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename ReduceOp, typename Dims, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device>
|
||||
{
|
||||
typedef TensorTupleReducerOp<ReduceOp, Dims, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename TensorIndexTupleOp<ArgType>::CoeffReturnType TupleType;
|
||||
typedef typename TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Dimensions Dimensions;
|
||||
typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions;
|
||||
static const int NumDims = internal::array_size<InputDimensions>::value;
|
||||
typedef array<Index, NumDims> StrideDims;
|
||||
|
||||
enum {
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
|
||||
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_orig_impl(op.expression(), device),
|
||||
m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
|
||||
m_return_dim(op.return_dim()) {
|
||||
|
||||
gen_strides(m_orig_impl.dimensions(), m_strides);
|
||||
if (Layout == static_cast<int>(ColMajor)) {
|
||||
const Index total_size = internal::array_prod(m_orig_impl.dimensions());
|
||||
m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size;
|
||||
} else {
|
||||
const Index total_size = internal::array_prod(m_orig_impl.dimensions());
|
||||
m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
|
||||
}
|
||||
m_stride_div = m_strides[m_return_dim];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
|
||||
return m_impl.dimensions();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
|
||||
const TupleType v = m_impl.coeff(index);
|
||||
return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
|
||||
}
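// Worked example of the index arithmetic above (illustrative): for a ColMajor 2x3x4
// input and m_return_dim == 1, the strides are {1, 2, 6}, so m_stride_div == 2 and
// m_stride_mod == 6. A winning linear index of 9 (coordinates (1, 1, 1)) then maps to
// (9 % 6) / 2 == 1, i.e. the coordinate along dimension 1.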
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
const double compute_cost = 1.0 +
|
||||
(m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>()));
|
||||
return m_orig_impl.costPerCoeff(vectorized) +
|
||||
m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost);
|
||||
}
|
||||
|
||||
private:
|
||||
EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) {
|
||||
if (m_return_dim < 0) {
|
||||
return; // Won't be using the strides.
|
||||
}
|
||||
eigen_assert(m_return_dim < NumDims &&
|
||||
"Asking to convert index to a dimension outside of the rank");
|
||||
|
||||
// Calculate m_stride_div and m_stride_mod, which are used to
|
||||
// calculate the value of an index w.r.t. the m_return_dim.
|
||||
if (Layout == static_cast<int>(ColMajor)) {
|
||||
strides[0] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
strides[i] = strides[i-1] * dims[i-1];
|
||||
}
|
||||
} else {
|
||||
strides[NumDims-1] = 1;
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
strides[i] = strides[i+1] * dims[i+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl;
|
||||
TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl;
|
||||
const int m_return_dim;
|
||||
StrideDims m_strides;
|
||||
Index m_stride_mod;
|
||||
Index m_stride_div;
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
|
|
@@ -0,0 +1,181 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorAssign
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief The tensor assignment class.
|
||||
*
|
||||
* This class represents the assignment of the values resulting from the evaluation of
|
||||
* the rhs expression to the memory locations denoted by the lhs expression.
|
||||
*/
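// Illustrative sketch: device-side assignments such as the one below are funneled through
// TensorAssignOp by the tensor executors (a hedged example, assuming the Tensor class and
// the usual arithmetic operators declared elsewhere in this module):
//
// \code
// Eigen::Tensor<float, 2> a(2, 3), b(2, 3), c(2, 3);
// b.setRandom();
// c.setRandom();
// a = b + c;  // internally evaluated as a TensorAssignOp<lhs, rhs expression>
// \endcode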
|
||||
namespace internal {
|
||||
template<typename LhsXprType, typename RhsXprType>
|
||||
struct traits<TensorAssignOp<LhsXprType, RhsXprType> >
|
||||
{
|
||||
typedef typename LhsXprType::Scalar Scalar;
|
||||
typedef typename traits<LhsXprType>::StorageKind StorageKind;
|
||||
typedef typename promote_index_type<typename traits<LhsXprType>::Index,
|
||||
typename traits<RhsXprType>::Index>::type Index;
|
||||
typedef typename LhsXprType::Nested LhsNested;
|
||||
typedef typename RhsXprType::Nested RhsNested;
|
||||
typedef typename remove_reference<LhsNested>::type _LhsNested;
|
||||
typedef typename remove_reference<RhsNested>::type _RhsNested;
|
||||
static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
|
||||
static const int Layout = internal::traits<LhsXprType>::Layout;
|
||||
|
||||
enum {
|
||||
Flags = 0
|
||||
};
|
||||
};
|
||||
|
||||
template<typename LhsXprType, typename RhsXprType>
|
||||
struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorAssignOp<LhsXprType, RhsXprType>& type;
|
||||
};
|
||||
|
||||
template<typename LhsXprType, typename RhsXprType>
|
||||
struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type>
|
||||
{
|
||||
typedef TensorAssignOp<LhsXprType, RhsXprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename LhsXprType, typename RhsXprType>
|
||||
class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> >
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename LhsXprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorAssignOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
|
||||
: m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
|
||||
|
||||
/** \returns the nested expressions */
|
||||
EIGEN_DEVICE_FUNC
|
||||
typename internal::remove_all<typename LhsXprType::Nested>::type&
|
||||
lhsExpression() const { return *((typename internal::remove_all<typename LhsXprType::Nested>::type*)&m_lhs_xpr); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename RhsXprType::Nested>::type&
|
||||
rhsExpression() const { return m_rhs_xpr; }
|
||||
|
||||
protected:
|
||||
typename internal::remove_all<typename LhsXprType::Nested>::type& m_lhs_xpr;
|
||||
const typename internal::remove_all<typename RhsXprType::Nested>::type& m_rhs_xpr;
|
||||
};
|
||||
|
||||
|
||||
template<typename LeftArgType, typename RightArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
|
||||
{
|
||||
typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
|
||||
m_leftImpl(op.lhsExpression(), device),
|
||||
m_rightImpl(op.rhsExpression(), device)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
|
||||
{
|
||||
// The dimensions of the lhs and the rhs tensors should be equal to prevent
|
||||
// overflows and ensure the result is fully initialized.
|
||||
// TODO: use left impl instead if right impl dimensions are known at compile time.
|
||||
return m_rightImpl.dimensions();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
|
||||
eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
|
||||
m_leftImpl.evalSubExprsIfNeeded(NULL);
|
||||
// If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
|
||||
// null value), attempt to evaluate the rhs expression in place. Returns true iff in place
|
||||
// evaluation isn't supported and the caller still needs to manually assign the values generated
|
||||
// by the rhs to the lhs.
|
||||
return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data());
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_leftImpl.cleanup();
|
||||
m_rightImpl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
|
||||
m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
|
||||
const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
|
||||
const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
|
||||
m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
|
||||
}
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_leftImpl.coeff(index);
|
||||
}
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
|
||||
{
|
||||
return m_leftImpl.template packet<LoadMode>(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
// We assume that evalPacket or evalScalar is called to perform the
|
||||
// assignment and account for the cost of the write here, but reduce left
|
||||
// cost by one load because we are using m_leftImpl.coeffRef.
|
||||
TensorOpCost left = m_leftImpl.costPerCoeff(vectorized);
|
||||
return m_rightImpl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(
|
||||
numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)),
|
||||
left.bytes_stored(), left.compute_cycles()) +
|
||||
TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); }
|
||||
|
||||
private:
|
||||
TensorEvaluator<LeftArgType, Device> m_leftImpl;
|
||||
TensorEvaluator<RightArgType, Device> m_rightImpl;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
|
File diff suppressed because it is too large
|
@@ -0,0 +1,392 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorBroadcasting
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor broadcasting class.
|
||||
*
|
||||
*
|
||||
*/
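// Illustrative sketch (assuming the TensorBase::broadcast() helper declared elsewhere in
// this module): each dimension of the result is the input dimension multiplied by the
// corresponding broadcast factor.
//
// \code
// Eigen::Tensor<float, 2> t(2, 3);
// t.setRandom();
// Eigen::array<Eigen::DenseIndex, 2> bcast;
// bcast[0] = 2; bcast[1] = 1;
// Eigen::Tensor<float, 2> r = t.broadcast(bcast);  // r is 4x3: t tiled twice along dimension 0
// \endcode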
|
||||
namespace internal {
|
||||
template<typename Broadcast, typename XprType>
|
||||
struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename Broadcast, typename XprType>
|
||||
struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorBroadcastingOp<Broadcast, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename Broadcast, typename XprType>
|
||||
struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type>
|
||||
{
|
||||
typedef TensorBroadcastingOp<Broadcast, XprType> type;
|
||||
};
|
||||
|
||||
template <typename Dims>
|
||||
struct is_input_scalar {
|
||||
static const bool value = false;
|
||||
};
|
||||
template <>
|
||||
struct is_input_scalar<Sizes<> > {
|
||||
static const bool value = true;
|
||||
};
|
||||
#ifndef EIGEN_EMULATE_CXX11_META_H
|
||||
template <typename std::size_t... Indices>
|
||||
struct is_input_scalar<Sizes<Indices...> > {
|
||||
static const bool value = (Sizes<Indices...>::total_size == 1);
|
||||
};
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename Broadcast, typename XprType>
|
||||
class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
|
||||
: m_xpr(expr), m_broadcast(broadcast) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const Broadcast& broadcast() const { return m_broadcast; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const Broadcast m_broadcast;
|
||||
};
|
||||
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename Broadcast, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
{
|
||||
typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_broadcast(op.broadcast()),m_impl(op.expression(), device)
|
||||
{
|
||||
// The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
|
||||
// and store the result in a scalar. Instead one should reshape the scalar into an N-D
|
||||
// tensor of rank N >= 1 holding a single element first and then broadcast.
|
||||
EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
const InputDimensions& input_dims = m_impl.dimensions();
|
||||
const Broadcast& broadcast = op.broadcast();
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
eigen_assert(input_dims[i] > 0);
|
||||
m_dimensions[i] = input_dims[i] * broadcast[i];
|
||||
}
|
||||
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_inputStrides[0] = 1;
|
||||
m_outputStrides[0] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
|
||||
m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
|
||||
}
|
||||
} else {
|
||||
m_inputStrides[NumDims-1] = 1;
|
||||
m_outputStrides[NumDims-1] = 1;
|
||||
for (int i = NumDims-2; i >= 0; --i) {
|
||||
m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
|
||||
m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
|
||||
}
|
||||
}
|
||||
}
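// Worked example of the set-up above (illustrative): broadcasting a ColMajor 2x3 input
// with factors {2, 1} yields m_dimensions == {4, 3}, m_inputStrides == {1, 2} and
// m_outputStrides == {1, 4}; coeffColMajor() below folds an output index back into the
// 2x3 input using the per-dimension modulo arithmetic.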
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
|
||||
return m_impl.coeff(0);
|
||||
}
|
||||
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
return coeffColMajor(index);
|
||||
} else {
|
||||
return coeffRowMajor(index);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: attempt to speed this up. The integer divisions and modulo are slow.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
|
||||
{
|
||||
Index inputIndex = 0;
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx = index / m_outputStrides[i];
|
||||
if (internal::index_statically_eq<Broadcast>(i, 1)) {
|
||||
eigen_assert(idx < m_impl.dimensions()[i]);
|
||||
inputIndex += idx * m_inputStrides[i];
|
||||
} else {
|
||||
if (internal::index_statically_eq<InputDimensions>(i, 1)) {
|
||||
eigen_assert(idx % m_impl.dimensions()[i] == 0);
|
||||
} else {
|
||||
inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
|
||||
}
|
||||
}
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
if (internal::index_statically_eq<Broadcast>(0, 1)) {
|
||||
eigen_assert(index < m_impl.dimensions()[0]);
|
||||
inputIndex += index;
|
||||
} else {
|
||||
if (internal::index_statically_eq<InputDimensions>(0, 1)) {
|
||||
eigen_assert(index % m_impl.dimensions()[0] == 0);
|
||||
} else {
|
||||
inputIndex += (index % m_impl.dimensions()[0]);
|
||||
}
|
||||
}
|
||||
return m_impl.coeff(inputIndex);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
|
||||
{
|
||||
Index inputIndex = 0;
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const Index idx = index / m_outputStrides[i];
|
||||
if (internal::index_statically_eq<Broadcast>(i, 1)) {
|
||||
eigen_assert(idx < m_impl.dimensions()[i]);
|
||||
inputIndex += idx * m_inputStrides[i];
|
||||
} else {
|
||||
if (internal::index_statically_eq<InputDimensions>(i, 1)) {
|
||||
eigen_assert(idx % m_impl.dimensions()[i] == 0);
|
||||
} else {
|
||||
inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
|
||||
}
|
||||
}
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
|
||||
eigen_assert(index < m_impl.dimensions()[NumDims-1]);
|
||||
inputIndex += index;
|
||||
} else {
|
||||
if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
|
||||
eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
|
||||
} else {
|
||||
inputIndex += (index % m_impl.dimensions()[NumDims-1]);
|
||||
}
|
||||
}
|
||||
return m_impl.coeff(inputIndex);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
|
||||
return internal::pset1<PacketReturnType>(m_impl.coeff(0));
|
||||
}
|
||||
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
return packetColMajor<LoadMode>(index);
|
||||
} else {
|
||||
return packetRowMajor<LoadMode>(index);
|
||||
}
|
||||
}
|
||||
|
||||
// Ignore the LoadMode and always use unaligned loads since we can't guarantee
|
||||
// the alignment at compile time.
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
const Index originalIndex = index;
|
||||
|
||||
Index inputIndex = 0;
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx = index / m_outputStrides[i];
|
||||
if (internal::index_statically_eq<Broadcast>(i, 1)) {
|
||||
eigen_assert(idx < m_impl.dimensions()[i]);
|
||||
inputIndex += idx * m_inputStrides[i];
|
||||
} else {
|
||||
if (internal::index_statically_eq<InputDimensions>(i, 1)) {
|
||||
eigen_assert(idx % m_impl.dimensions()[i] == 0);
|
||||
} else {
|
||||
inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
|
||||
}
|
||||
}
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
Index innermostLoc;
|
||||
if (internal::index_statically_eq<Broadcast>(0, 1)) {
|
||||
eigen_assert(index < m_impl.dimensions()[0]);
|
||||
innermostLoc = index;
|
||||
} else {
|
||||
if (internal::index_statically_eq<InputDimensions>(0, 1)) {
|
||||
eigen_assert(index % m_impl.dimensions()[0] == 0);
|
||||
innermostLoc = 0;
|
||||
} else {
|
||||
innermostLoc = index % m_impl.dimensions()[0];
|
||||
}
|
||||
}
|
||||
inputIndex += innermostLoc;
|
||||
|
||||
// Todo: this could be extended to the second dimension if we're not
|
||||
// broadcasting alongside the first dimension, and so on.
|
||||
if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
|
||||
return m_impl.template packet<Unaligned>(inputIndex);
|
||||
} else {
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
values[0] = m_impl.coeff(inputIndex);
|
||||
for (int i = 1; i < PacketSize; ++i) {
|
||||
values[i] = coeffColMajor(originalIndex+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
const Index originalIndex = index;
|
||||
|
||||
Index inputIndex = 0;
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const Index idx = index / m_outputStrides[i];
|
||||
if (internal::index_statically_eq<Broadcast>(i, 1)) {
|
||||
eigen_assert(idx < m_impl.dimensions()[i]);
|
||||
inputIndex += idx * m_inputStrides[i];
|
||||
} else {
|
||||
if (internal::index_statically_eq<InputDimensions>(i, 1)) {
|
||||
eigen_assert(idx % m_impl.dimensions()[i] == 0);
|
||||
} else {
|
||||
inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
|
||||
}
|
||||
}
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
Index innermostLoc;
|
||||
if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
|
||||
eigen_assert(index < m_impl.dimensions()[NumDims-1]);
|
||||
innermostLoc = index;
|
||||
} else {
|
||||
if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
|
||||
eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
|
||||
innermostLoc = 0;
|
||||
} else {
|
||||
innermostLoc = index % m_impl.dimensions()[NumDims-1];
|
||||
}
|
||||
}
|
||||
inputIndex += innermostLoc;
|
||||
|
||||
// Todo: this could be extended to the second dimension if we're not
|
||||
// broadcasting alongside the first dimension, and so on.
|
||||
if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
|
||||
return m_impl.template packet<Unaligned>(inputIndex);
|
||||
} else {
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
values[0] = m_impl.coeff(inputIndex);
|
||||
for (int i = 1; i < PacketSize; ++i) {
|
||||
values[i] = coeffRowMajor(originalIndex+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
double compute_cost = TensorOpCost::AddCost<Index>();
|
||||
if (NumDims > 0) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
compute_cost += TensorOpCost::DivCost<Index>();
|
||||
if (internal::index_statically_eq<Broadcast>(i, 1)) {
|
||||
compute_cost +=
|
||||
TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
|
||||
} else {
|
||||
if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
|
||||
compute_cost += TensorOpCost::MulCost<Index>() +
|
||||
TensorOpCost::ModCost<Index>() +
|
||||
TensorOpCost::AddCost<Index>();
|
||||
}
|
||||
}
|
||||
compute_cost +=
|
||||
TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
|
||||
}
|
||||
}
|
||||
return m_impl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||
|
||||
Broadcast functor() const { return m_broadcast; }
|
||||
|
||||
protected:
|
||||
const Broadcast m_broadcast;
|
||||
Dimensions m_dimensions;
|
||||
array<Index, NumDims> m_outputStrides;
|
||||
array<Index, NumDims> m_inputStrides;
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
};
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
|
|
@@ -0,0 +1,384 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorKChippingReshaping
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor.
|
||||
*
|
||||
*
|
||||
*/
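// Illustrative sketch (assuming the TensorBase::chip() helper declared elsewhere in this
// module): chipping removes one dimension by fixing its coordinate to a given offset.
//
// \code
// Eigen::Tensor<float, 3> t(2, 3, 4);
// t.setRandom();
// Eigen::Tensor<float, 2> slice = t.chip(1, 2);  // the 2x3 slice at index 1 along dimension 2
// \endcode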
|
||||
|
||||
namespace internal {
|
||||
template<DenseIndex DimId, typename XprType>
|
||||
struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions - 1;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<DenseIndex DimId, typename XprType>
|
||||
struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorChippingOp<DimId, XprType>& type;
|
||||
};
|
||||
|
||||
template<DenseIndex DimId, typename XprType>
|
||||
struct nested<TensorChippingOp<DimId, XprType>, 1, typename eval<TensorChippingOp<DimId, XprType> >::type>
|
||||
{
|
||||
typedef TensorChippingOp<DimId, XprType> type;
|
||||
};
|
||||
|
||||
template <DenseIndex DimId>
|
||||
struct DimensionId
|
||||
{
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
|
||||
eigen_assert(dim == DimId);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
|
||||
return DimId;
|
||||
}
|
||||
};
|
||||
template <>
|
||||
struct DimensionId<Dynamic>
|
||||
{
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) {
|
||||
eigen_assert(dim >= 0);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
|
||||
return actual_dim;
|
||||
}
|
||||
private:
|
||||
const DenseIndex actual_dim;
|
||||
};
|
||||
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<DenseIndex DimId, typename XprType>
|
||||
class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> >
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
|
||||
: m_xpr(expr), m_offset(offset), m_dim(dim) {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const Index offset() const { return m_offset; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
const Index dim() const { return m_dim.actualDim(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorChippingOp, const TensorChippingOp> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorChippingOp, const OtherDerived> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const Index m_offset;
|
||||
const internal::DimensionId<DimId> m_dim;
|
||||
};
|
||||
|
||||
|
||||
// Eval as rvalue
|
||||
template<DenseIndex DimId, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
{
|
||||
typedef TensorChippingOp<DimId, ArgType> XprType;
|
||||
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
static const int NumDims = NumInputDims-1;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
|
||||
enum {
|
||||
// Alignment can't be guaranteed at compile time since it depends on the
|
||||
// slice offsets.
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
eigen_assert(NumInputDims > m_dim.actualDim());
|
||||
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
eigen_assert(op.offset() < input_dims[m_dim.actualDim()]);
|
||||
|
||||
int j = 0;
|
||||
for (int i = 0; i < NumInputDims; ++i) {
|
||||
if (i != m_dim.actualDim()) {
|
||||
m_dimensions[j] = input_dims[i];
|
||||
++j;
|
||||
}
|
||||
}
|
||||
|
||||
m_stride = 1;
|
||||
m_inputStride = 1;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = 0; i < m_dim.actualDim(); ++i) {
|
||||
m_stride *= input_dims[i];
|
||||
m_inputStride *= input_dims[i];
|
||||
}
|
||||
} else {
|
||||
for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) {
|
||||
m_stride *= input_dims[i];
|
||||
m_inputStride *= input_dims[i];
|
||||
}
|
||||
}
|
||||
m_inputStride *= input_dims[m_dim.actualDim()];
|
||||
m_inputOffset = m_stride * op.offset();
|
||||
}
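// Worked example of the strides above (illustrative): chipping dimension 1 of a ColMajor
// 2x3x4 input gives m_stride == 2 (product of the dimensions before dimension 1),
// m_inputStride == 6 (m_stride times the chipped extent 3), and for offset 2 an
// m_inputOffset of 4, so output index k maps to (k / 2) * 6 + (k % 2) + 4 in the input.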
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_impl.coeff(srcCoeff(index));
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
|
||||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
|
||||
// m_stride is equal to 1, so let's avoid the integer division.
|
||||
eigen_assert(m_stride == 1);
|
||||
Index inputIndex = index * m_inputStride + m_inputOffset;
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = m_impl.coeff(inputIndex);
|
||||
inputIndex += m_inputStride;
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) ||
|
||||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
|
||||
// m_stride is always greater than index, so let's avoid the integer division.
|
||||
eigen_assert(m_stride > index);
|
||||
return m_impl.template packet<LoadMode>(index + m_inputOffset);
|
||||
} else {
|
||||
const Index idx = index / m_stride;
|
||||
const Index rem = index - idx * m_stride;
|
||||
if (rem + PacketSize <= m_stride) {
|
||||
Index inputIndex = idx * m_inputStride + m_inputOffset + rem;
|
||||
return m_impl.template packet<LoadMode>(inputIndex);
|
||||
} else {
|
||||
// Cross the stride boundary. Fallback to slow path.
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = coeff(index);
|
||||
++index;
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
double cost = 0;
|
||||
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
|
||||
m_dim.actualDim() == 0) ||
|
||||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
|
||||
m_dim.actualDim() == NumInputDims - 1)) {
|
||||
cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
|
||||
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
|
||||
m_dim.actualDim() == NumInputDims - 1) ||
|
||||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
|
||||
m_dim.actualDim() == 0)) {
|
||||
cost += TensorOpCost::AddCost<Index>();
|
||||
} else {
|
||||
cost += 3 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>() +
|
||||
3 * TensorOpCost::AddCost<Index>();
|
||||
}
|
||||
|
||||
return m_impl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const {
|
||||
CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
|
||||
if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) ||
|
||||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) &&
|
||||
result) {
|
||||
return result + m_inputOffset;
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
|
||||
{
|
||||
Index inputIndex;
|
||||
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
|
||||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
|
||||
// m_stride is equal to 1, so let's avoid the integer division.
|
||||
eigen_assert(m_stride == 1);
|
||||
inputIndex = index * m_inputStride + m_inputOffset;
|
||||
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) ||
|
||||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
|
||||
// m_stride is always greater than index, so let's avoid the integer division.
|
||||
eigen_assert(m_stride > index);
|
||||
inputIndex = index + m_inputOffset;
|
||||
} else {
|
||||
const Index idx = index / m_stride;
|
||||
inputIndex = idx * m_inputStride + m_inputOffset;
|
||||
index -= idx * m_stride;
|
||||
inputIndex += index;
|
||||
}
|
||||
return inputIndex;
|
||||
}
|
||||
|
||||
Dimensions m_dimensions;
|
||||
Index m_stride;
|
||||
Index m_inputOffset;
|
||||
Index m_inputStride;
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
const internal::DimensionId<DimId> m_dim;
|
||||
const Device& m_device;
|
||||
};
|
||||
|
||||
|
||||
// Eval as lvalue
|
||||
template<DenseIndex DimId, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
|
||||
: public TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
{
|
||||
typedef TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Base;
|
||||
typedef TensorChippingOp<DimId, ArgType> XprType;
|
||||
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
static const int NumDims = NumInputDims-1;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: Base(op, device)
|
||||
{ }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
||||
{
|
||||
return this->m_impl.coeffRef(this->srcCoeff(index));
|
||||
}
|
||||
|
||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
|
||||
if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
|
||||
(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
|
||||
// m_stride is equal to 1, so let's avoid the integer division.
|
||||
eigen_assert(this->m_stride == 1);
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
|
||||
Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
this->m_impl.coeffRef(inputIndex) = values[i];
|
||||
inputIndex += this->m_inputStride;
|
||||
}
|
||||
} else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) ||
|
||||
(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
|
||||
// m_stride is always greater than index, so let's avoid the integer division.
|
||||
eigen_assert(this->m_stride > index);
|
||||
this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
|
||||
} else {
|
||||
const Index idx = index / this->m_stride;
|
||||
const Index rem = index - idx * this->m_stride;
|
||||
if (rem + PacketSize <= this->m_stride) {
|
||||
const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem;
|
||||
this->m_impl.template writePacket<StoreMode>(inputIndex, x);
|
||||
} else {
|
||||
// Cross stride boundary. Fallback to slow path.
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
this->coeffRef(index) = values[i];
|
||||
++index;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
|
|
@@ -0,0 +1,361 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorConcatenationOp
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor concatenation class.
|
||||
*
|
||||
*
|
||||
*/
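// Illustrative sketch (assuming the TensorBase::concatenate() helper declared elsewhere
// in this module): all dimensions except the concatenation axis must match.
//
// \code
// Eigen::Tensor<float, 2> a(2, 3), b(2, 5);
// a.setRandom();
// b.setRandom();
// Eigen::Tensor<float, 2> c = a.concatenate(b, 1);  // c is 2x8
// \endcode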
|
||||
namespace internal {
|
||||
template<typename Axis, typename LhsXprType, typename RhsXprType>
|
||||
struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >
|
||||
{
|
||||
// Type promotion to handle the case where the types of the lhs and the rhs are different.
|
||||
typedef typename promote_storage_type<typename LhsXprType::Scalar,
|
||||
typename RhsXprType::Scalar>::ret Scalar;
|
||||
typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
|
||||
typename traits<RhsXprType>::StorageKind>::ret StorageKind;
|
||||
typedef typename promote_index_type<typename traits<LhsXprType>::Index,
|
||||
typename traits<RhsXprType>::Index>::type Index;
|
||||
typedef typename LhsXprType::Nested LhsNested;
|
||||
typedef typename RhsXprType::Nested RhsNested;
|
||||
typedef typename remove_reference<LhsNested>::type _LhsNested;
|
||||
typedef typename remove_reference<RhsNested>::type _RhsNested;
|
||||
static const int NumDimensions = traits<LhsXprType>::NumDimensions;
|
||||
static const int Layout = traits<LhsXprType>::Layout;
|
||||
enum { Flags = 0 };
|
||||
};
|
||||
|
||||
template<typename Axis, typename LhsXprType, typename RhsXprType>
|
||||
struct eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorConcatenationOp<Axis, LhsXprType, RhsXprType>& type;
|
||||
};
|
||||
|
||||
template<typename Axis, typename LhsXprType, typename RhsXprType>
|
||||
struct nested<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, 1, typename eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >::type>
|
||||
{
|
||||
typedef TensorConcatenationOp<Axis, LhsXprType, RhsXprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
template<typename Axis, typename LhsXprType, typename RhsXprType>
|
||||
class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar;
|
||||
typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind;
|
||||
typedef typename internal::traits<TensorConcatenationOp>::Index Index;
|
||||
typedef typename internal::nested<TensorConcatenationOp>::type Nested;
|
||||
typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
|
||||
typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis)
|
||||
: m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename LhsXprType::Nested>::type&
|
||||
lhsExpression() const { return m_lhs_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename RhsXprType::Nested>::type&
|
||||
rhsExpression() const { return m_rhs_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const TensorConcatenationOp& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorConcatenationOp, const TensorConcatenationOp> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const OtherDerived& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorConcatenationOp, const OtherDerived> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
protected:
|
||||
typename LhsXprType::Nested m_lhs_xpr;
|
||||
typename RhsXprType::Nested m_rhs_xpr;
|
||||
const Axis m_axis;
|
||||
};
|
||||
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename Axis, typename LeftArgType, typename RightArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
|
||||
{
|
||||
typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
|
||||
static const int RightNumDims = internal::array_size<typename TensorEvaluator<RightArgType, Device>::Dimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
|
||||
eigen_assert(0 <= m_axis && m_axis < NumDims);
|
||||
const Dimensions& lhs_dims = m_leftImpl.dimensions();
|
||||
const Dimensions& rhs_dims = m_rightImpl.dimensions();
|
||||
{
|
||||
int i = 0;
|
||||
for (; i < m_axis; ++i) {
|
||||
eigen_assert(lhs_dims[i] > 0);
|
||||
eigen_assert(lhs_dims[i] == rhs_dims[i]);
|
||||
m_dimensions[i] = lhs_dims[i];
|
||||
}
|
||||
eigen_assert(lhs_dims[i] > 0); // Now i == m_axis.
|
||||
eigen_assert(rhs_dims[i] > 0);
|
||||
m_dimensions[i] = lhs_dims[i] + rhs_dims[i];
|
||||
for (++i; i < NumDims; ++i) {
|
||||
eigen_assert(lhs_dims[i] > 0);
|
||||
eigen_assert(lhs_dims[i] == rhs_dims[i]);
|
||||
m_dimensions[i] = lhs_dims[i];
|
||||
}
|
||||
}
|
||||
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_leftStrides[0] = 1;
|
||||
m_rightStrides[0] = 1;
|
||||
m_outputStrides[0] = 1;
|
||||
|
||||
for (int j = 1; j < NumDims; ++j) {
|
||||
m_leftStrides[j] = m_leftStrides[j-1] * lhs_dims[j-1];
|
||||
m_rightStrides[j] = m_rightStrides[j-1] * rhs_dims[j-1];
|
||||
m_outputStrides[j] = m_outputStrides[j-1] * m_dimensions[j-1];
|
||||
}
|
||||
} else {
|
||||
m_leftStrides[NumDims - 1] = 1;
|
||||
m_rightStrides[NumDims - 1] = 1;
|
||||
m_outputStrides[NumDims - 1] = 1;
|
||||
|
||||
for (int j = NumDims - 2; j >= 0; --j) {
|
||||
m_leftStrides[j] = m_leftStrides[j+1] * lhs_dims[j+1];
|
||||
m_rightStrides[j] = m_rightStrides[j+1] * rhs_dims[j+1];
|
||||
m_outputStrides[j] = m_outputStrides[j+1] * m_dimensions[j+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
// TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/)
|
||||
{
|
||||
m_leftImpl.evalSubExprsIfNeeded(NULL);
|
||||
m_rightImpl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
|
||||
{
|
||||
m_leftImpl.cleanup();
|
||||
m_rightImpl.cleanup();
|
||||
}
|
||||
|
||||
// TODO(phli): attempt to speed this up. The integer divisions and modulo are slow.
|
||||
// See CL/76180724 comments for more ideas.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
// Collect dimension-wise indices (subs).
|
||||
array<Index, NumDims> subs;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
subs[i] = index / m_outputStrides[i];
|
||||
index -= subs[i] * m_outputStrides[i];
|
||||
}
|
||||
subs[0] = index;
|
||||
} else {
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
subs[i] = index / m_outputStrides[i];
|
||||
index -= subs[i] * m_outputStrides[i];
|
||||
}
|
||||
subs[NumDims - 1] = index;
|
||||
}
|
||||
|
||||
const Dimensions& left_dims = m_leftImpl.dimensions();
|
||||
if (subs[m_axis] < left_dims[m_axis]) {
|
||||
Index left_index;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
left_index = subs[0];
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
|
||||
}
|
||||
} else {
|
||||
left_index = subs[NumDims - 1];
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
|
||||
}
|
||||
}
|
||||
return m_leftImpl.coeff(left_index);
|
||||
} else {
|
||||
subs[m_axis] -= left_dims[m_axis];
|
||||
const Dimensions& right_dims = m_rightImpl.dimensions();
|
||||
Index right_index;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
right_index = subs[0];
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
|
||||
}
|
||||
} else {
|
||||
right_index = subs[NumDims - 1];
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
|
||||
}
|
||||
}
|
||||
return m_rightImpl.coeff(right_index);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(phli): Add a real vectorization.
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
|
||||
|
||||
EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
|
||||
2 * TensorOpCost::MulCost<Index>() +
|
||||
TensorOpCost::DivCost<Index>() +
|
||||
TensorOpCost::ModCost<Index>());
|
||||
const double lhs_size = m_leftImpl.dimensions().TotalSize();
|
||||
const double rhs_size = m_rightImpl.dimensions().TotalSize();
|
||||
return (lhs_size / (lhs_size + rhs_size)) *
|
||||
m_leftImpl.costPerCoeff(vectorized) +
|
||||
(rhs_size / (lhs_size + rhs_size)) *
|
||||
m_rightImpl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, compute_cost);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
protected:
|
||||
Dimensions m_dimensions;
|
||||
array<Index, NumDims> m_outputStrides;
|
||||
array<Index, NumDims> m_leftStrides;
|
||||
array<Index, NumDims> m_rightStrides;
|
||||
TensorEvaluator<LeftArgType, Device> m_leftImpl;
|
||||
TensorEvaluator<RightArgType, Device> m_rightImpl;
|
||||
const Axis m_axis;
|
||||
};
|
||||
|
||||
// Eval as lvalue
|
||||
template<typename Axis, typename LeftArgType, typename RightArgType, typename Device>
|
||||
struct TensorEvaluator<TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
|
||||
: public TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
|
||||
{
|
||||
typedef TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> Base;
|
||||
typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
|
||||
typedef typename Base::Dimensions Dimensions;
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
|
||||
: Base(op, device)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
}
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
||||
{
|
||||
// Collect dimension-wise indices (subs).
|
||||
array<Index, Base::NumDims> subs;
|
||||
for (int i = Base::NumDims - 1; i > 0; --i) {
|
||||
subs[i] = index / this->m_outputStrides[i];
|
||||
index -= subs[i] * this->m_outputStrides[i];
|
||||
}
|
||||
subs[0] = index;
|
||||
|
||||
const Dimensions& left_dims = this->m_leftImpl.dimensions();
|
||||
if (subs[this->m_axis] < left_dims[this->m_axis]) {
|
||||
Index left_index = subs[0];
|
||||
for (int i = 1; i < Base::NumDims; ++i) {
|
||||
left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i];
|
||||
}
|
||||
return this->m_leftImpl.coeffRef(left_index);
|
||||
} else {
|
||||
subs[this->m_axis] -= left_dims[this->m_axis];
|
||||
const Dimensions& right_dims = this->m_rightImpl.dimensions();
|
||||
Index right_index = subs[0];
|
||||
for (int i = 1; i < Base::NumDims; ++i) {
|
||||
right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i];
|
||||
}
|
||||
return this->m_rightImpl.coeffRef(right_index);
|
||||
}
|
||||
}
|
||||
|
||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x)
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());
|
||||
|
||||
EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
|
||||
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
coeffRef(index+i) = values[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
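For reference, a minimal usage sketch of the concatenation op defined above (using only the public unsupported Tensor API; not part of the vendored header): every dimension except the concatenation axis must match, and the result's size along that axis is the sum of the operands' sizes.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(2, 3);
  Eigen::Tensor<float, 2> b(4, 3);
  a.setConstant(1.0f);
  b.setConstant(2.0f);
  // Concatenate along axis 0: the result is a (2 + 4) x 3 tensor.
  Eigen::Tensor<float, 2> c = a.concatenate(b, 0);
  return c.dimension(0) == 6 ? 0 : 1;
}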
|
|
@ -0,0 +1,628 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorContraction
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor contraction class.
|
||||
*
|
||||
*
|
||||
*/
|
||||
namespace internal {
|
||||
|
||||
template<typename Dimensions, typename LhsXprType, typename RhsXprType>
|
||||
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
|
||||
{
|
||||
// Type promotion to handle the case where the types of the lhs and the rhs are different.
|
||||
typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
|
||||
typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar;
|
||||
|
||||
typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
|
||||
typename traits<RhsXprType>::StorageKind>::ret StorageKind;
|
||||
typedef typename promote_index_type<typename traits<LhsXprType>::Index,
|
||||
typename traits<RhsXprType>::Index>::type Index;
|
||||
typedef typename LhsXprType::Nested LhsNested;
|
||||
typedef typename RhsXprType::Nested RhsNested;
|
||||
typedef typename remove_reference<LhsNested>::type _LhsNested;
|
||||
typedef typename remove_reference<RhsNested>::type _RhsNested;
|
||||
|
||||
// From NumDims below.
|
||||
static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
|
||||
static const int Layout = traits<LhsXprType>::Layout;
|
||||
|
||||
enum {
|
||||
Flags = 0
|
||||
};
|
||||
};
|
||||
|
||||
template<typename Dimensions, typename LhsXprType, typename RhsXprType>
|
||||
struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType>& type;
|
||||
};
|
||||
|
||||
template<typename Dimensions, typename LhsXprType, typename RhsXprType>
|
||||
struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >::type>
|
||||
{
|
||||
typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType> type;
|
||||
};
|
||||
|
||||
template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename Device_>
|
||||
struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_>, Device_> > {
|
||||
typedef Indices_ Indices;
|
||||
typedef LeftArgType_ LeftArgType;
|
||||
typedef RightArgType_ RightArgType;
|
||||
typedef Device_ Device;
|
||||
|
||||
// From NumDims below.
|
||||
static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
template<typename Indices, typename LhsXprType, typename RhsXprType>
|
||||
class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
|
||||
typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
|
||||
typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(
|
||||
const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims)
|
||||
: m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const Indices& indices() const { return m_indices; }
|
||||
|
||||
/** \returns the nested expressions */
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename LhsXprType::Nested>::type&
|
||||
lhsExpression() const { return m_lhs_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename RhsXprType::Nested>::type&
|
||||
rhsExpression() const { return m_rhs_xpr; }
|
||||
|
||||
protected:
|
||||
typename LhsXprType::Nested m_lhs_xpr;
|
||||
typename RhsXprType::Nested m_rhs_xpr;
|
||||
const Indices m_indices;
|
||||
};
|
||||
|
||||
|
||||
template<typename Derived>
|
||||
struct TensorContractionEvaluatorBase
|
||||
{
|
||||
typedef typename internal::traits<Derived>::Indices Indices;
|
||||
typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
|
||||
typedef typename internal::traits<Derived>::RightArgType RightArgType;
|
||||
typedef typename internal::traits<Derived>::Device Device;
|
||||
|
||||
typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
|
||||
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
// Most of the code is assuming that both input tensors are ColMajor. If the
|
||||
// inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
|
||||
// If we want to compute A * B = C, where A is LHS and B is RHS, the code
|
||||
// will pretend B is LHS and A is RHS.
|
||||
typedef typename internal::conditional<
|
||||
static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
|
||||
typedef typename internal::conditional<
|
||||
static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
|
||||
|
||||
static const int LDims =
|
||||
internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
|
||||
static const int RDims =
|
||||
internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
|
||||
static const int ContractDims = internal::array_size<Indices>::value;
|
||||
static const int NumDims = LDims + RDims - 2 * ContractDims;
|
||||
|
||||
typedef array<Index, ContractDims> contract_t;
|
||||
typedef array<Index, LDims - ContractDims> left_nocontract_t;
|
||||
typedef array<Index, RDims - ContractDims> right_nocontract_t;
|
||||
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
TensorContractionEvaluatorBase(const XprType& op, const Device& device)
|
||||
: m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
|
||||
op.lhsExpression(), op.rhsExpression()), device),
|
||||
m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
|
||||
op.rhsExpression(), op.lhsExpression()), device),
|
||||
m_device(device),
|
||||
m_result(NULL) {
|
||||
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
|
||||
static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
|
||||
YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
|
||||
|
||||
DSizes<Index, LDims> eval_left_dims;
|
||||
DSizes<Index, RDims> eval_right_dims;
|
||||
array<IndexPair<Index>, ContractDims> eval_op_indices;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
// For ColMajor, we keep using the existing dimensions
|
||||
for (int i = 0; i < LDims; i++) {
|
||||
eval_left_dims[i] = m_leftImpl.dimensions()[i];
|
||||
}
|
||||
for (int i = 0; i < RDims; i++) {
|
||||
eval_right_dims[i] = m_rightImpl.dimensions()[i];
|
||||
}
|
||||
// We keep the pairs of contracting indices.
|
||||
for (int i = 0; i < ContractDims; i++) {
|
||||
eval_op_indices[i].first = op.indices()[i].first;
|
||||
eval_op_indices[i].second = op.indices()[i].second;
|
||||
}
|
||||
} else {
|
||||
// For RowMajor, we need to reverse the existing dimensions
|
||||
for (int i = 0; i < LDims; i++) {
|
||||
eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1];
|
||||
}
|
||||
for (int i = 0; i < RDims; i++) {
|
||||
eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1];
|
||||
}
|
||||
// We need to flip all the pairs of contracting indices as well as
|
||||
// reversing the dimensions.
|
||||
for (int i = 0; i < ContractDims; i++) {
|
||||
eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second;
|
||||
eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first;
|
||||
}
|
||||
}
|
||||
|
||||
// Check for duplicate axes and make sure the first index in eval_op_indices
|
||||
// is increasing. Using O(n^2) sorting is OK since ContractDims is small
|
||||
for (int i = 0; i < ContractDims; i++) {
|
||||
for (int j = i + 1; j < ContractDims; j++) {
|
||||
eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first &&
|
||||
eval_op_indices[j].second != eval_op_indices[i].second &&
|
||||
"contraction axes should be unique");
|
||||
if (eval_op_indices[j].first < eval_op_indices[i].first) {
|
||||
numext::swap(eval_op_indices[j], eval_op_indices[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
array<Index, LDims> lhs_strides;
|
||||
lhs_strides[0] = 1;
|
||||
for (int i = 0; i < LDims-1; ++i) {
|
||||
lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i];
|
||||
}
|
||||
|
||||
array<Index, RDims> rhs_strides;
|
||||
rhs_strides[0] = 1;
|
||||
for (int i = 0; i < RDims-1; ++i) {
|
||||
rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i];
|
||||
}
|
||||
|
||||
if (m_i_strides.size() > 0) m_i_strides[0] = 1;
|
||||
if (m_j_strides.size() > 0) m_j_strides[0] = 1;
|
||||
if (m_k_strides.size() > 0) m_k_strides[0] = 1;
|
||||
|
||||
m_i_size = 1;
|
||||
m_j_size = 1;
|
||||
m_k_size = 1;
|
||||
|
||||
// To compute the dimension, we simply concatenate the non-contracting
|
||||
// dimensions of the left and then the right tensor. Additionally, we also
|
||||
// compute the strides corresponding to the left non-contracting
|
||||
// dimensions and right non-contracting dimensions.
|
||||
m_lhs_inner_dim_contiguous = true;
|
||||
int dim_idx = 0;
|
||||
unsigned int nocontract_idx = 0;
|
||||
|
||||
for (int i = 0; i < LDims; i++) {
|
||||
// find if we are contracting on index i of left tensor
|
||||
bool contracting = false;
|
||||
for (int j = 0; j < ContractDims; j++) {
|
||||
if (eval_op_indices[j].first == i) {
|
||||
contracting = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!contracting) {
|
||||
// add dimension size to output dimensions
|
||||
m_dimensions[dim_idx] = eval_left_dims[i];
|
||||
m_left_nocontract_strides[nocontract_idx] = lhs_strides[i];
|
||||
if (dim_idx != i) {
|
||||
m_lhs_inner_dim_contiguous = false;
|
||||
}
|
||||
if (nocontract_idx+1 < internal::array_size<left_nocontract_t>::value) {
|
||||
m_i_strides[nocontract_idx+1] =
|
||||
m_i_strides[nocontract_idx] * eval_left_dims[i];
|
||||
} else {
|
||||
m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i];
|
||||
}
|
||||
dim_idx++;
|
||||
nocontract_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
nocontract_idx = 0;
|
||||
for (int i = 0; i < RDims; i++) {
|
||||
bool contracting = false;
|
||||
// find if we are contracting on index i of right tensor
|
||||
for (int j = 0; j < ContractDims; j++) {
|
||||
if (eval_op_indices[j].second == i) {
|
||||
contracting = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!contracting) {
|
||||
m_dimensions[dim_idx] = eval_right_dims[i];
|
||||
if (nocontract_idx+1 < internal::array_size<right_nocontract_t>::value) {
|
||||
m_j_strides[nocontract_idx+1] =
|
||||
m_j_strides[nocontract_idx] * eval_right_dims[i];
|
||||
} else {
|
||||
m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i];
|
||||
}
|
||||
m_right_nocontract_strides[nocontract_idx] = rhs_strides[i];
|
||||
dim_idx++;
|
||||
nocontract_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
// Now compute the strides corresponding to the contracting dimensions. We
|
||||
// assumed above that non-contracting axes are represented in the same order
|
||||
// in the matrix as they are in the tensor. This is not the case for
|
||||
// contracting axes. As the contracting axes must be of the same size in
|
||||
// each tensor, we'll only look at the first tensor here.
|
||||
m_rhs_inner_dim_contiguous = true;
|
||||
m_rhs_inner_dim_reordered = false;
|
||||
for (int i = 0; i < ContractDims; i++) {
|
||||
Index left = eval_op_indices[i].first;
|
||||
Index right = eval_op_indices[i].second;
|
||||
|
||||
Index size = eval_left_dims[left];
|
||||
eigen_assert(size == eval_right_dims[right] &&
|
||||
"Contraction axes must be same size");
|
||||
|
||||
if (i+1 < static_cast<int>(internal::array_size<contract_t>::value)) {
|
||||
m_k_strides[i+1] = m_k_strides[i] * size;
|
||||
} else {
|
||||
m_k_size = m_k_strides[i] * size;
|
||||
}
|
||||
m_left_contracting_strides[i] = lhs_strides[left];
|
||||
m_right_contracting_strides[i] = rhs_strides[right];
|
||||
|
||||
if (i > 0 && right < eval_op_indices[i-1].second) {
|
||||
m_rhs_inner_dim_reordered = true;
|
||||
}
|
||||
if (right != i) {
|
||||
m_rhs_inner_dim_contiguous = false;
|
||||
}
|
||||
}
|
||||
|
||||
// If the layout is RowMajor, we need to reverse the m_dimensions
|
||||
if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
|
||||
for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
|
||||
numext::swap(m_dimensions[i], m_dimensions[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
|
||||
m_leftImpl.evalSubExprsIfNeeded(NULL);
|
||||
m_rightImpl.evalSubExprsIfNeeded(NULL);
|
||||
if (data) {
|
||||
evalTo(data);
|
||||
return false;
|
||||
} else {
|
||||
m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
|
||||
evalTo(m_result);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
|
||||
if (this->m_lhs_inner_dim_contiguous) {
|
||||
if (this->m_rhs_inner_dim_contiguous) {
|
||||
if (this->m_rhs_inner_dim_reordered) {
|
||||
static_cast<const Derived*>(this)->template evalProduct<true, true, true, Unaligned>(buffer);
|
||||
}
|
||||
else {
|
||||
static_cast<const Derived*>(this)->template evalProduct<true, true, false, Unaligned>(buffer);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (this->m_rhs_inner_dim_reordered) {
|
||||
static_cast<const Derived*>(this)->template evalProduct<true, false, true, Unaligned>(buffer);
|
||||
}
|
||||
else {
|
||||
static_cast<const Derived*>(this)->template evalProduct<true, false, false, Unaligned>(buffer);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (this->m_rhs_inner_dim_contiguous) {
|
||||
if (this->m_rhs_inner_dim_reordered) {
|
||||
static_cast<const Derived*>(this)->template evalProduct<false, true, true, Unaligned>(buffer);
|
||||
}
|
||||
else {
|
||||
static_cast<const Derived*>(this)->template evalProduct<false, true, false, Unaligned>(buffer);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (this->m_rhs_inner_dim_reordered) {
|
||||
static_cast<const Derived*>(this)->template evalProduct<false, false, true, Unaligned>(buffer);
|
||||
}
|
||||
else {
|
||||
static_cast<const Derived*>(this)->template evalProduct<false, false, false, Unaligned>(buffer);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
|
||||
EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const {
|
||||
const Index rows = m_i_size;
|
||||
const Index cols = m_k_size;
|
||||
|
||||
typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
|
||||
typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
|
||||
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
|
||||
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
|
||||
const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
|
||||
const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
|
||||
const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned;
|
||||
const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned;
|
||||
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
|
||||
LeftEvaluator, left_nocontract_t,
|
||||
contract_t, lhs_packet_size,
|
||||
lhs_inner_dim_contiguous,
|
||||
false, lhs_alignment> LhsMapper;
|
||||
|
||||
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
|
||||
RightEvaluator, right_nocontract_t,
|
||||
contract_t, rhs_packet_size,
|
||||
rhs_inner_dim_contiguous,
|
||||
rhs_inner_dim_reordered, rhs_alignment> RhsMapper;
|
||||
|
||||
LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides,
|
||||
m_left_contracting_strides, m_k_strides);
|
||||
RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides,
|
||||
m_right_contracting_strides, m_k_strides);
|
||||
|
||||
const Scalar alpha(1);
|
||||
const Index resIncr(1);
|
||||
|
||||
// zero out the result buffer (which must be of size at least rows * sizeof(Scalar))
|
||||
m_device.memset(buffer, 0, rows * sizeof(Scalar));
|
||||
|
||||
internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run(
|
||||
rows, cols, lhs, rhs,
|
||||
buffer, resIncr, alpha);
|
||||
}
|
||||
|
||||
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
|
||||
EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
|
||||
// columns in left side, rows in right side
|
||||
const Index k = this->m_k_size;
|
||||
|
||||
// rows in left side
|
||||
const Index m = this->m_i_size;
|
||||
|
||||
// columns in right side
|
||||
const Index n = this->m_j_size;
|
||||
|
||||
// zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
|
||||
this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
|
||||
|
||||
// define mr, nr, and all of my data mapper types
|
||||
typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
|
||||
typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
|
||||
typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
|
||||
|
||||
const Index nr = Traits::nr;
|
||||
const Index mr = Traits::mr;
|
||||
|
||||
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
|
||||
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
|
||||
|
||||
const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
|
||||
const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
|
||||
|
||||
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
|
||||
LeftEvaluator, left_nocontract_t,
|
||||
contract_t, lhs_packet_size,
|
||||
lhs_inner_dim_contiguous,
|
||||
false, Unaligned> LhsMapper;
|
||||
|
||||
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
|
||||
RightEvaluator, right_nocontract_t,
|
||||
contract_t, rhs_packet_size,
|
||||
rhs_inner_dim_contiguous,
|
||||
rhs_inner_dim_reordered, Unaligned> RhsMapper;
|
||||
|
||||
typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
|
||||
|
||||
// Declare GEBP packing and kernel structs
|
||||
internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
|
||||
internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
|
||||
|
||||
internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
|
||||
|
||||
// initialize data mappers
|
||||
LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
|
||||
this->m_left_contracting_strides, this->m_k_strides);
|
||||
|
||||
RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
|
||||
this->m_right_contracting_strides, this->m_k_strides);
|
||||
|
||||
OutputMapper output(buffer, m);
|
||||
|
||||
// Sizes of the blocks to load in cache. See the Goto paper for details.
|
||||
internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
|
||||
const Index kc = blocking.kc();
|
||||
const Index mc = numext::mini(m, blocking.mc());
|
||||
const Index nc = numext::mini(n, blocking.nc());
|
||||
const Index sizeA = mc * kc;
|
||||
const Index sizeB = kc * nc;
|
||||
|
||||
LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
|
||||
RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
|
||||
|
||||
for(Index i2=0; i2<m; i2+=mc)
|
||||
{
|
||||
const Index actual_mc = numext::mini(i2+mc,m)-i2;
|
||||
for (Index k2 = 0; k2 < k; k2 += kc) {
|
||||
// make sure we don't overshoot right edge of left matrix, then pack vertical panel
|
||||
const Index actual_kc = numext::mini(k2 + kc, k) - k2;
|
||||
pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
|
||||
|
||||
// series of horizontal blocks
|
||||
for (Index j2 = 0; j2 < n; j2 += nc) {
|
||||
// make sure we don't overshoot right edge of right matrix, then pack block
|
||||
const Index actual_nc = numext::mini(j2 + nc, n) - j2;
|
||||
pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
|
||||
|
||||
// call gebp (matrix kernel)
|
||||
// The parameters here are copied from Eigen's GEMM implementation
|
||||
gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this->m_device.deallocate(blockA);
|
||||
this->m_device.deallocate(blockB);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_leftImpl.cleanup();
|
||||
m_rightImpl.cleanup();
|
||||
|
||||
if (m_result != NULL) {
|
||||
m_device.deallocate(m_result);
|
||||
m_result = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
|
||||
return m_result[index];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
|
||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
|
||||
|
||||
protected:
|
||||
// Prevent assignment
|
||||
TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
|
||||
Dimensions m_dimensions;
|
||||
|
||||
contract_t m_k_strides;
|
||||
contract_t m_left_contracting_strides;
|
||||
contract_t m_right_contracting_strides;
|
||||
|
||||
bool m_lhs_inner_dim_contiguous;
|
||||
bool m_rhs_inner_dim_contiguous;
|
||||
bool m_rhs_inner_dim_reordered;
|
||||
|
||||
left_nocontract_t m_i_strides;
|
||||
right_nocontract_t m_j_strides;
|
||||
left_nocontract_t m_left_nocontract_strides;
|
||||
right_nocontract_t m_right_nocontract_strides;
|
||||
|
||||
Index m_i_size;
|
||||
Index m_j_size;
|
||||
Index m_k_size;
|
||||
|
||||
TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
|
||||
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
|
||||
const Device& m_device;
|
||||
Scalar* m_result;
|
||||
};
|
||||
|
||||
|
||||
// evaluator for default device
|
||||
template<typename Indices, typename LeftArgType, typename RightArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> :
|
||||
public TensorContractionEvaluatorBase<
|
||||
TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > {
|
||||
typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
|
||||
typedef TensorContractionEvaluatorBase<Self> Base;
|
||||
|
||||
typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
|
||||
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
enum {
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout
|
||||
};
|
||||
|
||||
// Most of the code is assuming that both input tensors are ColMajor. If the
|
||||
// inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
|
||||
// If we want to compute A * B = C, where A is LHS and B is RHS, the code
|
||||
// will pretend B is LHS and A is RHS.
|
||||
typedef typename internal::conditional<
|
||||
static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
|
||||
typedef typename internal::conditional<
|
||||
static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
|
||||
|
||||
static const int LDims =
|
||||
internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
|
||||
static const int RDims =
|
||||
internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
|
||||
static const int ContractDims = internal::array_size<Indices>::value;
|
||||
|
||||
typedef array<Index, ContractDims> contract_t;
|
||||
typedef array<Index, LDims - ContractDims> left_nocontract_t;
|
||||
typedef array<Index, RDims - ContractDims> right_nocontract_t;
|
||||
|
||||
static const int NumDims = LDims + RDims - 2 * ContractDims;
|
||||
|
||||
// Could we use NumDimensions here?
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
|
||||
Base(op, device) { }
|
||||
|
||||
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
|
||||
EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const {
|
||||
if (this->m_j_size == 1) {
|
||||
this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
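A minimal usage sketch of the contraction op defined above (public Tensor API only; not part of the vendored header): contracting dimension 1 of a 2x3 tensor with dimension 0 of a 3x4 tensor reproduces an ordinary matrix product and yields a 2x4 result.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(2, 3);
  Eigen::Tensor<float, 2> b(3, 4);
  a.setRandom();
  b.setRandom();
  // Contract dimension 1 of 'a' against dimension 0 of 'b' (a matrix product).
  Eigen::array<Eigen::IndexPair<int>, 1> dims = { Eigen::IndexPair<int>(1, 0) };
  Eigen::Tensor<float, 2> c = a.contract(b, dims);
  return (c.dimension(0) == 2 && c.dimension(1) == 4) ? 0 : 1;
}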
|
|
@ -0,0 +1,56 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
enum {
|
||||
ShardByRow = 0,
|
||||
ShardByCol = 1
|
||||
};
|
||||
|
||||
|
||||
// Default Blocking Strategy
|
||||
template <typename LhsMapper, typename RhsMapper, typename Index, int ShardingType=ShardByCol>
|
||||
class TensorContractionBlocking {
|
||||
public:
|
||||
|
||||
typedef typename LhsMapper::Scalar LhsScalar;
|
||||
typedef typename RhsMapper::Scalar RhsScalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
|
||||
kc_(k), mc_(m), nc_(n)
|
||||
{
|
||||
if (ShardingType == ShardByCol) {
|
||||
computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads);
|
||||
}
|
||||
else {
|
||||
computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
|
||||
|
||||
private:
|
||||
Index kc_;
|
||||
Index mc_;
|
||||
Index nc_;
|
||||
};
|
||||
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
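TensorContractionBlocking only chooses the block sizes; they are consumed by the Goto-style loop nest in evalGemm above. The following self-contained sketch shows the blocking idea in plain C++ on a column-major matrix product; it deliberately omits the packing steps and the vectorized gebp micro-kernel that the real code adds.

#include <algorithm>
#include <cstdio>
#include <vector>

// Cache-blocked C += A * B, all matrices column-major. mc/nc/kc play the role
// of the mc()/nc()/kc() values computed by TensorContractionBlocking.
void blockedGemm(const std::vector<float>& A, const std::vector<float>& B,
                 std::vector<float>& C, int m, int n, int k,
                 int mc, int nc, int kc) {
  for (int i2 = 0; i2 < m; i2 += mc)
    for (int k2 = 0; k2 < k; k2 += kc)
      for (int j2 = 0; j2 < n; j2 += nc)
        // Update one mc x nc block of C with a kc-deep slice of A and B.
        for (int j = j2; j < std::min(j2 + nc, n); ++j)
          for (int p = k2; p < std::min(k2 + kc, k); ++p)
            for (int i = i2; i < std::min(i2 + mc, m); ++i)
              C[i + j * m] += A[i + p * m] * B[p + j * k];
}

int main() {
  const int m = 4, n = 4, k = 4;
  std::vector<float> A(m * k, 1.0f), B(k * n, 1.0f), C(m * n, 0.0f);
  blockedGemm(A, B, C, m, n, k, /*mc=*/2, /*nc=*/2, /*kc=*/2);
  std::printf("C(0,0) = %f\n", C[0]);  // expected: 4
  return 0;
}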
|
|
@ -0,0 +1,469 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
enum {
|
||||
Rhs = 0,
|
||||
Lhs = 1
|
||||
};
|
||||
|
||||
/*
|
||||
* Implementation of the Eigen blas_data_mapper class for tensors.
|
||||
*/
|
||||
|
||||
template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
|
||||
enum {
|
||||
DirectOffsets = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) {
|
||||
eigen_assert(false && "unsupported");
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); }
|
||||
|
||||
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
|
||||
{
|
||||
return m_tensor.template packet<LoadMode>(index);
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
const Tensor m_tensor;
|
||||
};
|
||||
|
||||
template <typename Tensor> struct CoeffLoader<Tensor, true> {
|
||||
enum {
|
||||
DirectOffsets = true
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
|
||||
m_data += offset;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); }
|
||||
|
||||
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
|
||||
{
|
||||
return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
|
||||
}
|
||||
private:
|
||||
typedef typename Tensor::Scalar Scalar;
|
||||
const Scalar* m_data;
|
||||
};
|
||||
|
||||
template<typename Scalar, typename Index, int side,
|
||||
typename Tensor,
|
||||
typename nocontract_t, typename contract_t,
|
||||
int packet_size, bool inner_dim_contiguous, int Alignment>
|
||||
class SimpleTensorContractionMapper {
|
||||
public:
|
||||
EIGEN_DEVICE_FUNC
|
||||
SimpleTensorContractionMapper(const Tensor& tensor,
|
||||
const nocontract_t& nocontract_strides,
|
||||
const nocontract_t& ij_strides,
|
||||
const contract_t& contract_strides,
|
||||
const contract_t& k_strides) :
|
||||
m_tensor(tensor),
|
||||
m_nocontract_strides(nocontract_strides),
|
||||
m_ij_strides(ij_strides),
|
||||
m_contract_strides(contract_strides),
|
||||
m_k_strides(k_strides) { }
|
||||
|
||||
enum {
|
||||
DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess>::DirectOffsets
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
|
||||
m_tensor.offsetBuffer(offset);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
|
||||
// column major assumption
|
||||
return operator()(row, 0);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
|
||||
return m_tensor.coeff(computeIndex(row, col));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
|
||||
const bool left = (side == Lhs);
|
||||
EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: xxxps://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
|
||||
Index nocontract_val = left ? row : col;
|
||||
Index linidx = 0;
|
||||
for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
|
||||
const Index idx = nocontract_val / m_ij_strides[i];
|
||||
linidx += idx * m_nocontract_strides[i];
|
||||
nocontract_val -= idx * m_ij_strides[i];
|
||||
}
|
||||
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
|
||||
if (side == Lhs && inner_dim_contiguous) {
|
||||
eigen_assert(m_nocontract_strides[0] == 1);
|
||||
linidx += nocontract_val;
|
||||
} else {
|
||||
linidx += nocontract_val * m_nocontract_strides[0];
|
||||
}
|
||||
}
|
||||
|
||||
Index contract_val = left ? col : row;
|
||||
if(array_size<contract_t>::value > 0) {
|
||||
for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
|
||||
const Index idx = contract_val / m_k_strides[i];
|
||||
linidx += idx * m_contract_strides[i];
|
||||
contract_val -= idx * m_k_strides[i];
|
||||
}
|
||||
|
||||
if (side == Rhs && inner_dim_contiguous) {
|
||||
eigen_assert(m_contract_strides[0] == 1);
|
||||
linidx += contract_val;
|
||||
} else {
|
||||
linidx += contract_val * m_contract_strides[0];
|
||||
}
|
||||
}
|
||||
|
||||
return linidx;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
|
||||
const bool left = (side == Lhs);
|
||||
EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: xxxps://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
|
||||
Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
|
||||
Index linidx[2] = {0, 0};
|
||||
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
|
||||
for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
|
||||
const Index idx0 = nocontract_val[0] / m_ij_strides[i];
|
||||
const Index idx1 = nocontract_val[1] / m_ij_strides[i];
|
||||
linidx[0] += idx0 * m_nocontract_strides[i];
|
||||
linidx[1] += idx1 * m_nocontract_strides[i];
|
||||
nocontract_val[0] -= idx0 * m_ij_strides[i];
|
||||
nocontract_val[1] -= idx1 * m_ij_strides[i];
|
||||
}
|
||||
if (side == Lhs && inner_dim_contiguous) {
|
||||
eigen_assert(m_nocontract_strides[0] == 1);
|
||||
linidx[0] += nocontract_val[0];
|
||||
linidx[1] += nocontract_val[1];
|
||||
} else {
|
||||
linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
|
||||
linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
|
||||
}
|
||||
}
|
||||
|
||||
Index contract_val[2] = {left ? col : row, left ? col : row + distance};
|
||||
if (array_size<contract_t>::value > 0) {
|
||||
for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
|
||||
const Index idx0 = contract_val[0] / m_k_strides[i];
|
||||
const Index idx1 = contract_val[1] / m_k_strides[i];
|
||||
linidx[0] += idx0 * m_contract_strides[i];
|
||||
linidx[1] += idx1 * m_contract_strides[i];
|
||||
contract_val[0] -= idx0 * m_k_strides[i];
|
||||
contract_val[1] -= idx1 * m_k_strides[i];
|
||||
}
|
||||
|
||||
if (side == Rhs && inner_dim_contiguous) {
|
||||
eigen_assert(m_contract_strides[0] == 1);
|
||||
linidx[0] += contract_val[0];
|
||||
linidx[1] += contract_val[1];
|
||||
} else {
|
||||
linidx[0] += contract_val[0] * m_contract_strides[0];
|
||||
linidx[1] += contract_val[1] * m_contract_strides[0];
|
||||
}
|
||||
}
|
||||
return IndexPair<Index>(linidx[0], linidx[1]);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const {
|
||||
// Only claim alignment when we can compute the actual stride (i.e. when we're
|
||||
// dealing with the lhs with inner_dim_contiguous). This is because the
|
||||
// matrix-vector product relies on the stride when dealing with aligned inputs.
|
||||
return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
|
||||
return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
|
||||
}
|
||||
|
||||
protected:
|
||||
CoeffLoader<Tensor, Tensor::RawAccess> m_tensor;
|
||||
const nocontract_t m_nocontract_strides;
|
||||
const nocontract_t m_ij_strides;
|
||||
const contract_t m_contract_strides;
|
||||
const contract_t m_k_strides;
|
||||
};
|
||||
|
||||
|
||||
template<typename Scalar, typename Index, int side,
|
||||
typename Tensor,
|
||||
typename nocontract_t, typename contract_t,
|
||||
int packet_size, bool inner_dim_contiguous,
|
||||
bool inner_dim_reordered, int Alignment>
|
||||
class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment>
|
||||
{
|
||||
public:
|
||||
typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> ParentMapper;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
BaseTensorContractionMapper(const Tensor& tensor,
|
||||
const nocontract_t& nocontract_strides,
|
||||
const nocontract_t& ij_strides,
|
||||
const contract_t& contract_strides,
|
||||
const contract_t& k_strides) :
|
||||
ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
|
||||
|
||||
typedef typename Tensor::PacketReturnType Packet;
|
||||
typedef typename unpacket_traits<Packet>::half HalfPacket;
|
||||
|
||||
template <int AlignmentType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
|
||||
// whole method makes column major assumption
|
||||
|
||||
// don't need to add offsets for now (because operator handles that)
|
||||
// current code assumes packet size must be a multiple of 2
|
||||
EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
|
||||
if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
|
||||
const Index index = this->computeIndex(i, j);
|
||||
eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
|
||||
return this->m_tensor.template packet<AlignmentType>(index);
|
||||
}
|
||||
|
||||
const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
|
||||
const Index first = indexPair.first;
|
||||
const Index last = indexPair.second;
|
||||
|
||||
// We can always do optimized packet reads from left hand side right now, because
|
||||
// the vertical matrix dimension on the left hand side is never contracting.
|
||||
// On the right hand side we need to check if the contracting dimensions may have
|
||||
// been shuffled first.
|
||||
if (Tensor::PacketAccess &&
|
||||
(side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
|
||||
(last - first) == (packet_size - 1)) {
|
||||
|
||||
return this->m_tensor.template packet<AlignmentType>(first);
|
||||
}
|
||||
|
||||
EIGEN_ALIGN_MAX Scalar data[packet_size];
|
||||
|
||||
data[0] = this->m_tensor.coeff(first);
|
||||
for (Index k = 1; k < packet_size - 1; k += 2) {
|
||||
const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
|
||||
data[k] = this->m_tensor.coeff(internal_pair.first);
|
||||
data[k + 1] = this->m_tensor.coeff(internal_pair.second);
|
||||
}
|
||||
data[packet_size - 1] = this->m_tensor.coeff(last);
|
||||
|
||||
return pload<Packet>(data);
|
||||
}
|
||||
|
||||
template <int AlignmentType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
|
||||
// whole method makes column major assumption
|
||||
|
||||
// don't need to add offsets for now (because operator handles that)
|
||||
const Index half_packet_size = unpacket_traits<HalfPacket>::size;
|
||||
if (half_packet_size == packet_size) {
|
||||
return loadPacket<AlignmentType>(i, j);
|
||||
}
|
||||
EIGEN_ALIGN_MAX Scalar data[half_packet_size];
|
||||
for (Index k = 0; k < half_packet_size; k++) {
|
||||
data[k] = operator()(i + k, j);
|
||||
}
|
||||
return pload<HalfPacket>(data);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<typename Scalar, typename Index, int side,
|
||||
typename Tensor,
|
||||
typename nocontract_t, typename contract_t,
|
||||
bool inner_dim_contiguous,
|
||||
bool inner_dim_reordered, int Alignment>
|
||||
class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment>
|
||||
{
|
||||
public:
|
||||
typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> ParentMapper;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
BaseTensorContractionMapper(const Tensor& tensor,
|
||||
const nocontract_t& nocontract_strides,
|
||||
const nocontract_t& ij_strides,
|
||||
const contract_t& contract_strides,
|
||||
const contract_t& k_strides) :
|
||||
ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
|
||||
|
||||
typedef typename Tensor::PacketReturnType Packet;
|
||||
template <int> EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
|
||||
EIGEN_ALIGN_MAX Scalar data[1];
|
||||
data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
|
||||
return pload<typename Tensor::PacketReturnType>(data);
|
||||
}
|
||||
template <int> EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
|
||||
return loadPacket(i, j);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<typename Scalar, typename Index, int side,
|
||||
typename Tensor,
|
||||
typename nocontract_t, typename contract_t,
|
||||
int packet_size,
|
||||
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
|
||||
class TensorContractionSubMapper {
|
||||
public:
|
||||
typedef typename Tensor::PacketReturnType Packet;
|
||||
typedef typename unpacket_traits<Packet>::half HalfPacket;
|
||||
|
||||
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
|
||||
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
|
||||
typedef Self LinearMapper;
|
||||
|
||||
enum {
|
||||
// We can use direct offsets iff the parent mapper supports them and we can compute the strides.
|
||||
// TODO: we should also enable direct offsets for the Rhs case.
|
||||
UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size<contract_t>::value > 0)
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
|
||||
: m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) {
|
||||
// Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute
|
||||
// this offset every time we attempt to access a coefficient.
|
||||
if (UseDirectOffsets) {
|
||||
Index stride = m_base_mapper.stride();
|
||||
m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride);
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
|
||||
if (UseDirectOffsets) {
|
||||
return m_base_mapper(i, 0);
|
||||
}
|
||||
return m_base_mapper(i + m_vert_offset, m_horiz_offset);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
|
||||
if (UseDirectOffsets) {
|
||||
return m_base_mapper(i, j);
|
||||
}
|
||||
return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
|
||||
if (UseDirectOffsets) {
|
||||
return m_base_mapper.template loadPacket<Alignment>(i, 0);
|
||||
}
|
||||
return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
|
||||
if (UseDirectOffsets) {
|
||||
return m_base_mapper.template loadPacket<Alignment>(i, j);
|
||||
}
|
||||
return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, j + m_horiz_offset);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
|
||||
if (UseDirectOffsets) {
|
||||
return m_base_mapper.template loadHalfPacket<Alignment>(i, 0);
|
||||
}
|
||||
return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
|
||||
if (UseDirectOffsets) {
|
||||
m_base_mapper.storePacket(i, 0, p);
|
||||
}
|
||||
m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
|
||||
if (UseDirectOffsets) {
|
||||
return LinearMapper(m_base_mapper, i, j);
|
||||
}
|
||||
return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
|
||||
}
|
||||
|
||||
template <typename PacketT, int AlignmentType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
|
||||
if (UseDirectOffsets) {
|
||||
return m_base_mapper.template loadPacket<ActualAlignment>(i, 0);
|
||||
}
|
||||
return m_base_mapper.template loadPacket<ActualAlignment>(i + m_vert_offset, m_horiz_offset);
|
||||
}
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
ParentMapper m_base_mapper;
|
||||
const Index m_vert_offset;
|
||||
const Index m_horiz_offset;
|
||||
};
|
||||
|
||||
|
||||
template<typename Scalar_, typename Index, int side,
|
||||
typename Tensor,
|
||||
typename nocontract_t, typename contract_t,
|
||||
int packet_size,
|
||||
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
|
||||
class TensorContractionInputMapper
|
||||
: public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
|
||||
|
||||
public:
|
||||
typedef Scalar_ Scalar;
|
||||
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
|
||||
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
|
||||
typedef SubMapper VectorMapper;
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
|
||||
const nocontract_t& nocontract_strides,
|
||||
const nocontract_t& ij_strides,
|
||||
const contract_t& contract_strides,
|
||||
const contract_t& k_strides)
|
||||
: Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
|
||||
return SubMapper(*this, i, j);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
|
||||
return VectorMapper(*this, i, j);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
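The mappers above are internal plumbing for tensor contraction; at the user level they are exercised through Tensor::contract. A minimal sketch of such a call (function name and sizes are illustrative, assuming the unsupported Tensor module header):

#include <unsupported/Eigen/CXX11/Tensor>

// Sketch only: a rank-2 contraction (a matrix product) whose evaluation goes
// through the contraction input/sub mappers defined above.
void contraction_sketch() {
  Eigen::Tensor<float, 2> a(3, 4), b(4, 5);
  a.setRandom();
  b.setRandom();
  // Contract dimension 1 of 'a' against dimension 0 of 'b'.
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {{ Eigen::IndexPair<int>(1, 0) }};
  Eigen::Tensor<float, 2> c = a.contract(b, dims);
}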
|
File diff suppressed because it is too large
|
@ -0,0 +1,279 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorConversionOp
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor conversion class. This class makes it possible to vectorize
|
||||
* type casting operations when the number of scalars per packet in the source
|
||||
* and the destination type differ
|
||||
*/
|
||||
namespace internal {
|
||||
template<typename TargetType, typename XprType>
|
||||
struct traits<TensorConversionOp<TargetType, XprType> >
|
||||
{
|
||||
// Type promotion to handle the case where the types of the lhs and the rhs are different.
|
||||
typedef TargetType Scalar;
|
||||
typedef typename traits<XprType>::StorageKind StorageKind;
|
||||
typedef typename traits<XprType>::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = traits<XprType>::NumDimensions;
|
||||
static const int Layout = traits<XprType>::Layout;
|
||||
enum { Flags = 0 };
|
||||
};
|
||||
|
||||
template<typename TargetType, typename XprType>
|
||||
struct eval<TensorConversionOp<TargetType, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorConversionOp<TargetType, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename TargetType, typename XprType>
|
||||
struct nested<TensorConversionOp<TargetType, XprType>, 1, typename eval<TensorConversionOp<TargetType, XprType> >::type>
|
||||
{
|
||||
typedef TensorConversionOp<TargetType, XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
|
||||
struct PacketConverter {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
PacketConverter(const TensorEvaluator& impl)
|
||||
: m_impl(impl) {}
|
||||
|
||||
template<int LoadMode, typename Index>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
|
||||
return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<LoadMode>(index));
|
||||
}
|
||||
|
||||
private:
|
||||
const TensorEvaluator& m_impl;
|
||||
};
|
||||
|
||||
|
||||
template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
|
||||
struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 2, 1> {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
PacketConverter(const TensorEvaluator& impl)
|
||||
: m_impl(impl) {}
|
||||
|
||||
template<int LoadMode, typename Index>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
|
||||
const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
|
||||
|
||||
SrcPacket src1 = m_impl.template packet<LoadMode>(index);
|
||||
SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
|
||||
TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2);
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
const TensorEvaluator& m_impl;
|
||||
};
|
||||
|
||||
template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
|
||||
struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 4, 1> {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
PacketConverter(const TensorEvaluator& impl)
|
||||
: m_impl(impl) {}
|
||||
|
||||
template<int LoadMode, typename Index>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
|
||||
const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
|
||||
|
||||
SrcPacket src1 = m_impl.template packet<LoadMode>(index);
|
||||
SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
|
||||
SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
|
||||
SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
|
||||
TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4);
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
const TensorEvaluator& m_impl;
|
||||
};
|
||||
|
||||
template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
|
||||
struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
PacketConverter(const TensorEvaluator& impl)
|
||||
: m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {}
|
||||
|
||||
template<int LoadMode, typename Index>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
|
||||
const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
|
||||
// Only call m_impl.packet() when we have direct access to the underlying data. This
|
||||
// ensures that we don't compute the subexpression twice. We may however load some
|
||||
// coefficients twice, but in practice this doesn't negatively impact performance.
|
||||
if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) {
|
||||
// Force unaligned memory loads since we can't ensure alignment anymore
|
||||
return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<Unaligned>(index));
|
||||
} else {
|
||||
const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
|
||||
typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
|
||||
typedef typename internal::unpacket_traits<TgtPacket>::type TgtType;
|
||||
internal::scalar_cast_op<SrcType, TgtType> converter;
|
||||
EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
|
||||
for (int i = 0; i < TgtPacketSize; ++i) {
|
||||
values[i] = converter(m_impl.coeff(index+i));
|
||||
}
|
||||
TgtPacket rslt = internal::pload<TgtPacket>(values);
|
||||
return rslt;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const TensorEvaluator& m_impl;
|
||||
const typename TensorEvaluator::Index m_maxIndex;
|
||||
};
|
||||
|
||||
template<typename TargetType, typename XprType>
|
||||
class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename internal::traits<TensorConversionOp>::Scalar Scalar;
|
||||
typedef typename internal::traits<TensorConversionOp>::StorageKind StorageKind;
|
||||
typedef typename internal::traits<TensorConversionOp>::Index Index;
|
||||
typedef typename internal::nested<TensorConversionOp>::type Nested;
|
||||
typedef Scalar CoeffReturnType;
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr)
|
||||
: m_xpr(xpr) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
};
|
||||
|
||||
template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) {
|
||||
impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) {
|
||||
return impl.evalSubExprsIfNeeded(data);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename TargetType, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
|
||||
{
|
||||
typedef TensorConversionOp<TargetType, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
|
||||
typedef TargetType Scalar;
|
||||
typedef TargetType CoeffReturnType;
|
||||
typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef typename PacketType<SrcType, Device>::type PacketSourceType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device)
|
||||
{
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data)
|
||||
{
|
||||
return ConversionSubExprEval<internal::is_same<TargetType, SrcType>::value, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
|
||||
{
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
internal::scalar_cast_op<SrcType, TargetType> converter;
|
||||
return converter(m_impl.coeff(index));
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
const bool Vectorizable = TensorEvaluator<ArgType, Device>::PacketAccess &
|
||||
internal::type_casting_traits<SrcType, TargetType>::VectorizedCast;
|
||||
return PacketConv<LoadMode, Vectorizable>::run(m_impl, index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
const double cast_cost = TensorOpCost::CastCost<SrcType, TargetType>();
|
||||
if (vectorized) {
|
||||
const double SrcCoeffRatio =
|
||||
internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
|
||||
const double TgtCoeffRatio =
|
||||
internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
|
||||
return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) +
|
||||
TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize));
|
||||
} else {
|
||||
return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost);
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
protected:
|
||||
template <int LoadMode, bool ActuallyVectorize>
|
||||
struct PacketConv {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
|
||||
internal::scalar_cast_op<SrcType, TargetType> converter;
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = converter(impl.coeff(index+i));
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
};
|
||||
|
||||
template <int LoadMode>
|
||||
struct PacketConv<LoadMode, true> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
|
||||
const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
|
||||
const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
|
||||
PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
|
||||
SrcCoeffRatio, TgtCoeffRatio> converter(impl);
|
||||
return converter.template packet<LoadMode>(index);
|
||||
}
|
||||
};
|
||||
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
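TensorConversionOp is normally built through TensorBase::cast. A minimal sketch of the vectorized cast path this header implements (function name illustrative):

#include <unsupported/Eigen/CXX11/Tensor>

// Sketch only: cast() creates a TensorConversionOp; the evaluator picks a
// vectorized conversion when the source and target packet sizes differ.
void conversion_sketch() {
  Eigen::Tensor<int, 2> a(2, 3);
  a.setConstant(7);
  Eigen::Tensor<double, 2> b = a.cast<double>();  // element-wise int -> double
}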
|
File diff suppressed because it is too large
|
@ -0,0 +1,212 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorEvaluator
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief A cost model used to limit the number of threads used for evaluating
|
||||
* a tensor expression.
|
||||
*
|
||||
*/
|
||||
|
||||
// Class storing the cost of evaluating a tensor expression in terms of the
|
||||
// estimated number of operand bytes loaded, bytes stored, and compute cycles.
|
||||
class TensorOpCost {
|
||||
public:
|
||||
// TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
|
||||
// model based on minimal reciprocal throughput numbers from Intel or
|
||||
// Agner Fog's tables would be better than what is there now.
|
||||
template <typename ArgType>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
|
||||
return internal::functor_traits<
|
||||
internal::scalar_product_op<ArgType, ArgType> >::Cost;
|
||||
}
|
||||
template <typename ArgType>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
|
||||
return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
|
||||
}
|
||||
template <typename ArgType>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
|
||||
return internal::functor_traits<
|
||||
internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
|
||||
}
|
||||
template <typename ArgType>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
|
||||
return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
|
||||
}
|
||||
template <typename SrcType, typename TargetType>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
|
||||
return internal::functor_traits<
|
||||
internal::scalar_cast_op<SrcType, TargetType> >::Cost;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
|
||||
EIGEN_DEVICE_FUNC
|
||||
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
|
||||
: bytes_loaded_(bytes_loaded),
|
||||
bytes_stored_(bytes_stored),
|
||||
compute_cycles_(compute_cycles) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
|
||||
bool vectorized, double packet_size)
|
||||
: bytes_loaded_(bytes_loaded),
|
||||
bytes_stored_(bytes_stored),
|
||||
compute_cycles_(vectorized ? compute_cycles / packet_size
|
||||
: compute_cycles) {
|
||||
eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
|
||||
eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
|
||||
eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
|
||||
return bytes_loaded_;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
|
||||
return bytes_stored_;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
|
||||
return compute_cycles_;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
|
||||
double load_cost, double store_cost, double compute_cost) const {
|
||||
return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
|
||||
compute_cost * compute_cycles_;
|
||||
}
|
||||
|
||||
// Drop memory access component. Intended for cases when memory accesses are
|
||||
// sequential or are completely masked by computations.
|
||||
EIGEN_DEVICE_FUNC void dropMemoryCost() {
|
||||
bytes_loaded_ = 0;
|
||||
bytes_stored_ = 0;
|
||||
}
|
||||
|
||||
// TODO(rmlarsen): Define min in terms of total cost, not elementwise.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
|
||||
const TensorOpCost& rhs) const {
|
||||
double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
|
||||
double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
|
||||
double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
|
||||
return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
|
||||
}
|
||||
|
||||
// TODO(rmlarsen): Define max in terms of total cost, not elementwise.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
|
||||
const TensorOpCost& rhs) const {
|
||||
double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
|
||||
double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
|
||||
double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
|
||||
return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
|
||||
const TensorOpCost& rhs) {
|
||||
bytes_loaded_ += rhs.bytes_loaded();
|
||||
bytes_stored_ += rhs.bytes_stored();
|
||||
compute_cycles_ += rhs.compute_cycles();
|
||||
return *this;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
|
||||
bytes_loaded_ *= rhs;
|
||||
bytes_stored_ *= rhs;
|
||||
compute_cycles_ *= rhs;
|
||||
return *this;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
|
||||
TensorOpCost lhs, const TensorOpCost& rhs) {
|
||||
lhs += rhs;
|
||||
return lhs;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
|
||||
TensorOpCost lhs, double rhs) {
|
||||
lhs *= rhs;
|
||||
return lhs;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
|
||||
double lhs, TensorOpCost rhs) {
|
||||
rhs *= lhs;
|
||||
return rhs;
|
||||
}
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
|
||||
return os << "[bytes_loaded = " << tc.bytes_loaded()
|
||||
<< ", bytes_stored = " << tc.bytes_stored()
|
||||
<< ", compute_cycles = " << tc.compute_cycles() << "]";
|
||||
}
|
||||
|
||||
private:
|
||||
double bytes_loaded_;
|
||||
double bytes_stored_;
|
||||
double compute_cycles_;
|
||||
};
|
||||
|
||||
// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
|
||||
// in [1:max_threads] instead of just switching multi-threading off for small
|
||||
// work units.
|
||||
template <typename Device>
|
||||
class TensorCostModel {
|
||||
public:
|
||||
// Scaling from Eigen compute cost to device cycles.
|
||||
static const int kDeviceCyclesPerComputeCycle = 1;
|
||||
|
||||
// Costs in device cycles.
|
||||
static const int kStartupCycles = 100000;
|
||||
static const int kPerThreadCycles = 100000;
|
||||
static const int kTaskSize = 40000;
|
||||
|
||||
// Returns the number of threads in [1:max_threads] to use for
|
||||
// evaluating an expression with the given output size and cost per
|
||||
// coefficient.
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
|
||||
double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
|
||||
double cost = totalCost(output_size, cost_per_coeff);
|
||||
int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
|
||||
return numext::mini(max_threads, numext::maxi(1, threads));
|
||||
}
|
||||
|
||||
// taskSize assesses parallel task size.
|
||||
// Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
|
||||
// granularity needs to be increased to mitigate parallelization overheads.
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
|
||||
double output_size, const TensorOpCost& cost_per_coeff) {
|
||||
return totalCost(output_size, cost_per_coeff) / kTaskSize;
|
||||
}
|
||||
|
||||
private:
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
|
||||
double output_size, const TensorOpCost& cost_per_coeff) {
|
||||
// Cost of memory fetches from L2 cache. 64 is typical cache line size.
|
||||
// 11 is L2 cache latency on Haswell.
|
||||
// We don't know whether data is in L1, L2 or L3. But we are most interested
|
||||
// in single-threaded computational time around 100us-10ms (smaller time
|
||||
// is too small for parallelization, larger time is not interesting
|
||||
// either because we are probably using all available threads already).
|
||||
// And for the target time range, L2 seems to be what matters. Data set
|
||||
// fitting into L1 is too small to take noticeable time. Data set fitting
|
||||
// only into L3 presumably will take more than 10ms to load and process.
|
||||
const double kLoadCycles = 1.0 / 64 * 11;
|
||||
const double kStoreCycles = 1.0 / 64 * 11;
|
||||
// Scaling from Eigen compute cost to device cycles.
|
||||
return output_size *
|
||||
cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
|
||||
kDeviceCyclesPerComputeCycle);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
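A minimal sketch of how a caller might combine TensorOpCost with TensorCostModel to pick a thread count; the per-coefficient numbers are made up, and DefaultDevice is used only because the device parameter does not affect the computation shown here:

#include <unsupported/Eigen/CXX11/Tensor>

// Sketch only: estimate a per-coefficient cost and ask the cost model how many
// threads are worth spinning up for a given output size.
int num_threads_sketch(double output_size, int max_threads) {
  // 8 bytes loaded, 8 bytes stored, 4 compute cycles per output coefficient.
  const Eigen::TensorOpCost cost_per_coeff(8, 8, 4);
  return Eigen::TensorCostModel<Eigen::DefaultDevice>::numThreads(
      output_size, cost_per_coeff, max_threads);
}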
|
|
@ -0,0 +1,313 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorCustomUnaryOp
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor custom unary operation class.
|
||||
*
|
||||
*
|
||||
*/
|
||||
namespace internal {
|
||||
template<typename CustomUnaryFunc, typename XprType>
|
||||
struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
|
||||
{
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::StorageKind StorageKind;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = traits<XprType>::NumDimensions;
|
||||
static const int Layout = traits<XprType>::Layout;
|
||||
};
|
||||
|
||||
template<typename CustomUnaryFunc, typename XprType>
|
||||
struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename CustomUnaryFunc, typename XprType>
|
||||
struct nested<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
|
||||
{
|
||||
typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename CustomUnaryFunc, typename XprType>
|
||||
class TensorCustomUnaryOp : public TensorBase<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename internal::traits<TensorCustomUnaryOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename internal::nested<TensorCustomUnaryOp>::type Nested;
|
||||
typedef typename internal::traits<TensorCustomUnaryOp>::StorageKind StorageKind;
|
||||
typedef typename internal::traits<TensorCustomUnaryOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func)
|
||||
: m_expr(expr), m_func(func) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const CustomUnaryFunc& func() const { return m_func; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_expr; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_expr;
|
||||
const CustomUnaryFunc m_func;
|
||||
};
|
||||
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename CustomUnaryFunc, typename XprType, typename Device>
|
||||
struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Device>
|
||||
{
|
||||
typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> ArgType;
|
||||
typedef typename internal::traits<ArgType>::Index Index;
|
||||
static const int NumDims = internal::traits<ArgType>::NumDimensions;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
|
||||
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<XprType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
|
||||
: m_op(op), m_device(device), m_result(NULL)
|
||||
{
|
||||
m_dimensions = op.func().dimensions(op.expression());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
|
||||
if (data) {
|
||||
evalTo(data);
|
||||
return false;
|
||||
} else {
|
||||
m_result = static_cast<CoeffReturnType*>(
|
||||
m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
|
||||
evalTo(m_result);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
if (m_result != NULL) {
|
||||
m_device.deallocate(m_result);
|
||||
m_result = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
|
||||
return m_result[index];
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
// TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
|
||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
|
||||
TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(
|
||||
data, m_dimensions);
|
||||
m_op.func().eval(m_op.expression(), result, m_device);
|
||||
}
|
||||
|
||||
Dimensions m_dimensions;
|
||||
const ArgType m_op;
|
||||
const Device& m_device;
|
||||
CoeffReturnType* m_result;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/** \class TensorCustomBinaryOp
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor custom binary operation class.
|
||||
*
|
||||
*
|
||||
*/
|
||||
namespace internal {
|
||||
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
|
||||
struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
|
||||
{
|
||||
typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
|
||||
typename RhsXprType::Scalar>::ret Scalar;
|
||||
typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
|
||||
typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
|
||||
typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
|
||||
typename traits<RhsXprType>::StorageKind>::ret StorageKind;
|
||||
typedef typename promote_index_type<typename traits<LhsXprType>::Index,
|
||||
typename traits<RhsXprType>::Index>::type Index;
|
||||
typedef typename LhsXprType::Nested LhsNested;
|
||||
typedef typename RhsXprType::Nested RhsNested;
|
||||
typedef typename remove_reference<LhsNested>::type _LhsNested;
|
||||
typedef typename remove_reference<RhsNested>::type _RhsNested;
|
||||
static const int NumDimensions = traits<LhsXprType>::NumDimensions;
|
||||
static const int Layout = traits<LhsXprType>::Layout;
|
||||
};
|
||||
|
||||
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
|
||||
struct eval<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>& type;
|
||||
};
|
||||
|
||||
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
|
||||
struct nested<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
|
||||
{
|
||||
typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
|
||||
class TensorCustomBinaryOp : public TensorBase<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename internal::traits<TensorCustomBinaryOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename internal::traits<TensorCustomBinaryOp>::CoeffReturnType CoeffReturnType;
|
||||
typedef typename internal::nested<TensorCustomBinaryOp>::type Nested;
|
||||
typedef typename internal::traits<TensorCustomBinaryOp>::StorageKind StorageKind;
|
||||
typedef typename internal::traits<TensorCustomBinaryOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func)
|
||||
|
||||
: m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const CustomBinaryFunc& func() const { return m_func; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename LhsXprType::Nested>::type&
|
||||
lhsExpression() const { return m_lhs_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename RhsXprType::Nested>::type&
|
||||
rhsExpression() const { return m_rhs_xpr; }
|
||||
|
||||
protected:
|
||||
typename LhsXprType::Nested m_lhs_xpr;
|
||||
typename RhsXprType::Nested m_rhs_xpr;
|
||||
const CustomBinaryFunc m_func;
|
||||
};
|
||||
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, typename Device>
|
||||
struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Device>
|
||||
{
|
||||
typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> XprType;
|
||||
typedef typename internal::traits<XprType>::Index Index;
|
||||
static const int NumDims = internal::traits<XprType>::NumDimensions;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<LhsXprType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_op(op), m_device(device), m_result(NULL)
|
||||
{
|
||||
m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
|
||||
if (data) {
|
||||
evalTo(data);
|
||||
return false;
|
||||
} else {
|
||||
m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
|
||||
evalTo(m_result);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
if (m_result != NULL) {
|
||||
m_device.deallocate(m_result);
|
||||
m_result = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
|
||||
return m_result[index];
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
// TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
|
||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
|
||||
TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions);
|
||||
m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
|
||||
}
|
||||
|
||||
Dimensions m_dimensions;
|
||||
const XprType m_op;
|
||||
const Device& m_device;
|
||||
CoeffReturnType* m_result;
|
||||
};
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
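A custom op is driven by a user functor that reports the output dimensions and fills the result itself; a minimal sketch (the functor name and fixed rank are illustrative) wired up through TensorBase::customOp:

#include <unsupported/Eigen/CXX11/Tensor>

// Sketch only: a unary custom functor. dimensions() is queried by the
// evaluator above, and eval() writes directly into the preallocated result.
struct DoubleIt {
  Eigen::DSizes<Eigen::DenseIndex, 2> dimensions(const Eigen::Tensor<float, 2>& input) const {
    return input.dimensions();
  }
  template <typename Output, typename Device>
  void eval(const Eigen::Tensor<float, 2>& input, Output& output, const Device& device) const {
    output.device(device) = input * 2.0f;
  }
};

void custom_op_sketch() {
  Eigen::Tensor<float, 2> t(2, 2);
  t.setRandom();
  Eigen::Tensor<float, 2> r = t.customOp(DoubleIt());
}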
|
|
@ -0,0 +1,68 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorDevice
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Pseudo expression providing an operator = that will evaluate its argument
|
||||
* on the specified computing 'device' (GPU, thread pool, ...)
|
||||
*
|
||||
* Example:
|
||||
* C.device(EIGEN_GPU) = A + B;
|
||||
*
|
||||
* Todo: operator *= and /=.
|
||||
*/
|
||||
|
||||
template <typename ExpressionType, typename DeviceType> class TensorDevice {
|
||||
public:
|
||||
TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
|
||||
typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
|
||||
Assign assign(m_expression, other);
|
||||
internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
|
||||
typedef typename OtherDerived::Scalar Scalar;
|
||||
typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
|
||||
Sum sum(m_expression, other);
|
||||
typedef TensorAssignOp<ExpressionType, const Sum> Assign;
|
||||
Assign assign(m_expression, sum);
|
||||
internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
|
||||
typedef typename OtherDerived::Scalar Scalar;
|
||||
typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference;
|
||||
Difference difference(m_expression, other);
|
||||
typedef TensorAssignOp<ExpressionType, const Difference> Assign;
|
||||
Assign assign(m_expression, difference);
|
||||
internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
|
||||
return *this;
|
||||
}
|
||||
|
||||
protected:
|
||||
const DeviceType& m_device;
|
||||
ExpressionType& m_expression;
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
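A minimal sketch of the device() assignment pattern this class enables, using the single-core DefaultDevice (any device with the same interface works the same way):

#include <unsupported/Eigen/CXX11/Tensor>

// Sketch only: route assignments through an explicit device. DefaultDevice
// evaluates on the calling thread; thread-pool and GPU devices plug in the
// same way.
void device_assign_sketch() {
  Eigen::Tensor<float, 1> a(16), b(16), c(16);
  a.setRandom();
  b.setRandom();
  Eigen::DefaultDevice dev;
  c.device(dev) = a + b;   // evaluated via TensorDevice / TensorExecutor
  c.device(dev) += a;      // compound assignment is supported as well
}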
|
|
@ -0,0 +1,337 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
static const int kCudaScratchSize = 1024;
|
||||
|
||||
// This defines an interface that GpuDevice can take to use
|
||||
// CUDA streams underneath.
|
||||
class StreamInterface {
|
||||
public:
|
||||
virtual ~StreamInterface() {}
|
||||
|
||||
virtual const cudaStream_t& stream() const = 0;
|
||||
virtual const cudaDeviceProp& deviceProperties() const = 0;
|
||||
|
||||
// Allocate memory on the actual device where the computation will run
|
||||
virtual void* allocate(size_t num_bytes) const = 0;
|
||||
virtual void deallocate(void* buffer) const = 0;
|
||||
|
||||
// Return a scratchpad buffer of size 1k
|
||||
virtual void* scratchpad() const = 0;
|
||||
|
||||
// Return a semaphore. The semaphore is initially initialized to 0, and
|
||||
// each kernel using it is responsible for resetting to 0 upon completion
|
||||
// to maintain the invariant that the semaphore is always equal to 0 upon
|
||||
// each kernel start.
|
||||
virtual unsigned int* semaphore() const = 0;
|
||||
};
|
||||
|
||||
static cudaDeviceProp* m_deviceProperties;
|
||||
static bool m_devicePropInitialized = false;
|
||||
|
||||
static void initializeDeviceProp() {
|
||||
if (!m_devicePropInitialized) {
|
||||
// Attempts to ensure proper behavior in the case of multiple threads
|
||||
// calling this function simultaneously. This would be trivial to
|
||||
// implement if we could use std::mutex, but unfortunately std::mutex doesn't
|
||||
// compile with nvcc, so we resort to atomics and thread fences instead.
|
||||
// Note that if the caller uses a compiler that doesn't support c++11 we
|
||||
// can't ensure that the initialization is thread safe.
|
||||
#if __cplusplus >= 201103L
|
||||
static std::atomic<bool> first(true);
|
||||
if (first.exchange(false)) {
|
||||
#else
|
||||
static bool first = true;
|
||||
if (first) {
|
||||
first = false;
|
||||
#endif
|
||||
// We're the first thread to reach this point.
|
||||
int num_devices;
|
||||
cudaError_t status = cudaGetDeviceCount(&num_devices);
|
||||
if (status != cudaSuccess) {
|
||||
std::cerr << "Failed to get the number of CUDA devices: "
|
||||
<< cudaGetErrorString(status)
|
||||
<< std::endl;
|
||||
assert(status == cudaSuccess);
|
||||
}
|
||||
m_deviceProperties = new cudaDeviceProp[num_devices];
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
|
||||
if (status != cudaSuccess) {
|
||||
std::cerr << "Failed to initialize CUDA device #"
|
||||
<< i
|
||||
<< ": "
|
||||
<< cudaGetErrorString(status)
|
||||
<< std::endl;
|
||||
assert(status == cudaSuccess);
|
||||
}
|
||||
}
|
||||
|
||||
#if __cplusplus >= 201103L
|
||||
std::atomic_thread_fence(std::memory_order_release);
|
||||
#endif
|
||||
m_devicePropInitialized = true;
|
||||
} else {
|
||||
// Wait for the other thread to initialize the properties.
|
||||
while (!m_devicePropInitialized) {
|
||||
#if __cplusplus >= 201103L
|
||||
std::atomic_thread_fence(std::memory_order_acquire);
|
||||
#endif
|
||||
sleep(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const cudaStream_t default_stream = cudaStreamDefault;
|
||||
|
||||
class CudaStreamDevice : public StreamInterface {
|
||||
public:
|
||||
// Use the default stream on the current device
|
||||
CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
|
||||
cudaGetDevice(&device_);
|
||||
initializeDeviceProp();
|
||||
}
|
||||
// Use the default stream on the specified device
|
||||
CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
|
||||
initializeDeviceProp();
|
||||
}
|
||||
// Use the specified stream. Note that it's the
// caller's responsibility to ensure that the stream can run on
// the specified device. If no device is specified, the code
// assumes that the stream is associated with the current GPU device.
|
||||
CudaStreamDevice(const cudaStream_t* stream, int device = -1)
|
||||
: stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
|
||||
if (device < 0) {
|
||||
cudaGetDevice(&device_);
|
||||
} else {
|
||||
int num_devices;
|
||||
cudaError_t err = cudaGetDeviceCount(&num_devices);
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
assert(device < num_devices);
|
||||
device_ = device;
|
||||
}
|
||||
initializeDeviceProp();
|
||||
}
|
||||
|
||||
virtual ~CudaStreamDevice() {
|
||||
if (scratch_) {
|
||||
deallocate(scratch_);
|
||||
}
|
||||
}
|
||||
|
||||
const cudaStream_t& stream() const { return *stream_; }
|
||||
const cudaDeviceProp& deviceProperties() const {
|
||||
return m_deviceProperties[device_];
|
||||
}
|
||||
virtual void* allocate(size_t num_bytes) const {
|
||||
cudaError_t err = cudaSetDevice(device_);
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
void* result;
|
||||
err = cudaMalloc(&result, num_bytes);
|
||||
assert(err == cudaSuccess);
|
||||
assert(result != NULL);
|
||||
return result;
|
||||
}
|
||||
virtual void deallocate(void* buffer) const {
|
||||
cudaError_t err = cudaSetDevice(device_);
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
assert(buffer != NULL);
|
||||
err = cudaFree(buffer);
|
||||
assert(err == cudaSuccess);
|
||||
}
|
||||
|
||||
virtual void* scratchpad() const {
|
||||
if (scratch_ == NULL) {
|
||||
scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
|
||||
}
|
||||
return scratch_;
|
||||
}
|
||||
|
||||
virtual unsigned int* semaphore() const {
|
||||
if (semaphore_ == NULL) {
|
||||
char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
|
||||
semaphore_ = reinterpret_cast<unsigned int*>(scratch);
|
||||
cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
}
|
||||
return semaphore_;
|
||||
}
|
||||
|
||||
private:
|
||||
const cudaStream_t* stream_;
|
||||
int device_;
|
||||
mutable void* scratch_;
|
||||
mutable unsigned int* semaphore_;
|
||||
};
|
||||
|
||||
struct GpuDevice {
|
||||
// The StreamInterface is not owned: the caller is
|
||||
// responsible for its initialization and eventual destruction.
|
||||
explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
|
||||
eigen_assert(stream);
|
||||
}
|
||||
explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
|
||||
eigen_assert(stream);
|
||||
}
|
||||
// TODO(bsteiner): This is an internal API, we should not expose it.
|
||||
EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
|
||||
return stream_->stream();
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
|
||||
return stream_->allocate(num_bytes);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
|
||||
stream_->deallocate(buffer);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void* scratchpad() const {
|
||||
return stream_->scratchpad();
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE unsigned int* semaphore() const {
|
||||
return stream_->semaphore();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
|
||||
stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
|
||||
cudaError_t err =
|
||||
cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
|
||||
cudaError_t err =
|
||||
cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE size_t numThreads() const {
|
||||
// FIXME
|
||||
return 32;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||
// FIXME
|
||||
return 48*1024;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
||||
// We won't try to take advantage of the l2 cache for the time being, and
|
||||
// there is no l3 cache on cuda devices.
|
||||
return firstLevelCacheSize();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
|
||||
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
|
||||
cudaError_t err = cudaStreamSynchronize(stream_->stream());
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "Error detected in CUDA stream: "
|
||||
<< cudaGetErrorString(err)
|
||||
<< std::endl;
|
||||
assert(err == cudaSuccess);
|
||||
}
|
||||
#else
|
||||
assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
|
||||
return stream_->deviceProperties().multiProcessorCount;
|
||||
}
|
||||
EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
|
||||
return stream_->deviceProperties().maxThreadsPerBlock;
|
||||
}
|
||||
EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
|
||||
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
|
||||
}
|
||||
EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
|
||||
return stream_->deviceProperties().sharedMemPerBlock;
|
||||
}
|
||||
EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
||||
return stream_->deviceProperties().major;
|
||||
}
|
||||
EIGEN_STRONG_INLINE int minorDeviceVersion() const {
|
||||
return stream_->deviceProperties().minor;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE int maxBlocks() const {
|
||||
return max_blocks_;
|
||||
}
|
||||
|
||||
// This function checks if the CUDA runtime recorded an error for the
|
||||
// underlying stream device.
|
||||
inline bool ok() const {
|
||||
#ifdef __CUDACC__
|
||||
cudaError_t error = cudaStreamQuery(stream_->stream());
|
||||
return (error == cudaSuccess) || (error == cudaErrorNotReady);
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
private:
|
||||
const StreamInterface* stream_;
|
||||
int max_blocks_;
|
||||
};
|
||||
|
||||
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
|
||||
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
|
||||
assert(cudaGetLastError() == cudaSuccess);
|
||||
|
||||
|
||||
// FIXME: Should be device and kernel specific.
|
||||
#ifdef __CUDACC__
|
||||
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
|
||||
#ifndef __CUDA_ARCH__
|
||||
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
|
||||
EIGEN_UNUSED_VARIABLE(status)
|
||||
assert(status == cudaSuccess);
|
||||
#else
|
||||
EIGEN_UNUSED_VARIABLE(config)
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
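A minimal sketch of wiring a CudaStreamDevice into a GpuDevice; it assumes EIGEN_USE_GPU is defined, the file is compiled with nvcc, and the buffer size is illustrative:

#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

// Sketch only: run a tensor expression as a CUDA kernel on the default stream.
void gpu_assign_sketch() {
  Eigen::CudaStreamDevice stream;   // default stream on the current device
  Eigen::GpuDevice gpu(&stream);

  const int n = 1024;
  float* d_a = static_cast<float*>(gpu.allocate(n * sizeof(float)));
  float* d_b = static_cast<float*>(gpu.allocate(n * sizeof(float)));
  gpu.memset(d_a, 0, n * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1> > a(d_a, n), b(d_b, n);
  b.device(gpu) = a + a.constant(1.0f);   // launched on the wrapped stream

  gpu.synchronize();
  gpu.deallocate(d_a);
  gpu.deallocate(d_b);
}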
|
|
@ -0,0 +1,81 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// Default device for the machine (typically a single cpu core)
|
||||
struct DefaultDevice {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
|
||||
return internal::aligned_malloc(num_bytes);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
|
||||
internal::aligned_free(buffer);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
|
||||
::memcpy(dst, src, n);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
|
||||
memcpy(dst, src, n);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
|
||||
memcpy(dst, src, n);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
|
||||
::memset(buffer, c, n);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
// Running on the host CPU
|
||||
return 1;
|
||||
#else
|
||||
// Running on a CUDA device
|
||||
return 32;
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
// Running on the host CPU
|
||||
return l1CacheSize();
|
||||
#else
|
||||
// Running on a CUDA device, return the amount of shared memory available.
|
||||
return 48*1024;
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
// Running single threaded on the host CPU
|
||||
return l3CacheSize();
|
||||
#else
|
||||
// Running on a CUDA device
|
||||
return firstLevelCacheSize();
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
// Running single threaded on the host CPU
|
||||
// Should return an enum that encodes the ISA supported by the CPU
|
||||
return 1;
|
||||
#else
|
||||
// Running on a CUDA device
|
||||
return __CUDA_ARCH__ / 100;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
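A small sketch of how generic host code can query the device for sizing decisions; the same accessors exist on the other device types, so the heuristic stays device-agnostic:

#include <unsupported/Eigen/CXX11/Tensor>
#include <cstddef>

// Sketch only: pick a working-set size that stays within the first-level cache
// reported by the device (shared memory size when compiled for CUDA).
std::size_t pick_block_bytes_sketch() {
  Eigen::DefaultDevice dev;
  return dev.firstLevelCacheSize() / 2;
}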
|
|
@ -0,0 +1,122 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
|
||||
|
||||
namespace Eigen {
|
||||
struct SyclDevice {
|
||||
/// class members
|
||||
/// sycl queue
|
||||
mutable cl::sycl::queue m_queue;
|
||||
/// std::map is the container used to make sure that we create only one buffer
|
||||
/// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
|
||||
/// If a non-read-only pointer needs to be accessed on the host, it must be deallocated manually.
|
||||
mutable std::map<const void *, std::shared_ptr<void>> buffer_map;
|
||||
/// creating device by using selector
|
||||
template<typename dev_Selector> SyclDevice(dev_Selector s)
|
||||
:
|
||||
#ifdef EIGEN_EXCEPTIONS
|
||||
m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) {
|
||||
for (const auto& e : l) {
|
||||
try {
|
||||
std::rethrow_exception(e);
|
||||
} catch (cl::sycl::exception e) {
|
||||
std::cout << e.what() << std::endl;
|
||||
}
|
||||
}
|
||||
}))
|
||||
#else
|
||||
m_queue(cl::sycl::queue(s))
|
||||
#endif
|
||||
{}
|
||||
// destructor
|
||||
~SyclDevice() { deallocate_all(); }
|
||||
|
||||
template <typename T> void deallocate(T *p) const {
|
||||
auto it = buffer_map.find(p);
|
||||
if (it != buffer_map.end()) {
|
||||
buffer_map.erase(it);
|
||||
internal::aligned_free(p);
|
||||
}
|
||||
}
|
||||
void deallocate_all() const {
|
||||
std::map<const void *, std::shared_ptr<void>>::iterator it=buffer_map.begin();
|
||||
while (it!=buffer_map.end()) {
|
||||
auto p=it->first;
|
||||
buffer_map.erase(it);
|
||||
internal::aligned_free(const_cast<void*>(p));
|
||||
it=buffer_map.begin();
|
||||
}
|
||||
buffer_map.clear();
|
||||
}
|
||||
|
||||
/// creation of sycl accessor for a buffer. This function first tries to find
|
||||
/// the buffer in the buffer_map. If found it gets the accessor from it, if not,
|
||||
/// the function then adds an entry by creating a sycl buffer for that particular pointer.
|
||||
template <cl::sycl::access::mode AcMd, typename T> inline cl::sycl::accessor<T, 1, AcMd, cl::sycl::access::target::global_buffer>
|
||||
get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const {
|
||||
return (get_sycl_buffer<T>(num_bytes, ptr)->template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
|
||||
}
|
||||
|
||||
template<typename T> inline std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const {
|
||||
using Type = cl::sycl::buffer<T, 1>;
|
||||
std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> ret = buffer_map.insert(std::pair<const void *, std::shared_ptr<void>>(ptr, std::shared_ptr<void>(new Type(cl::sycl::range<1>(num_bytes)),
|
||||
[](void *dataMem) { delete static_cast<Type*>(dataMem); })));
|
||||
(static_cast<Type*>(buffer_map.at(ptr).get()))->set_final_data(nullptr);
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename T> inline cl::sycl::buffer<T, 1>* get_sycl_buffer(size_t num_bytes,const T * ptr) const {
|
||||
return static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(ptr, num_bytes).first->second.get());
|
||||
}
|
||||
|
||||
/// Host-side allocation. The returned pointer only serves as a key into buffer_map; the actual sycl buffer is created lazily by add_sycl_buffer.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const {
|
||||
return internal::aligned_malloc(8);
|
||||
}
|
||||
|
||||
// runtime checks deciding whether this device is suitable could be added here
|
||||
bool isDeviceSuitable() const { return true; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
|
||||
::memcpy(dst, src, n);
|
||||
}
|
||||
|
||||
template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const {
|
||||
auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
|
||||
memcpy(host_acc.get_pointer(), src, n);
|
||||
}
|
||||
/// with the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon.
|
||||
template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const {
|
||||
auto it = buffer_map.find(src);
|
||||
if (it != buffer_map.end()) {
|
||||
auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(it->second.get()))-> template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::host_buffer>();
|
||||
memcpy(dst,host_acc.get_pointer(), n);
|
||||
} else{
|
||||
eigen_assert("no device memory found. The memory might be destroyed before creation");
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const {
|
||||
::memset(buffer, c, n);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
||||
return 1;
|
||||
}
|
||||
};
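// Illustrative usage sketch (not part of the original commit). It assumes a SYCL
// runtime providing cl::sycl::default_selector; the names below are hypothetical
// and only show how the device API above fits together.
inline void sycl_device_usage_sketch() {
  SyclDevice dev((cl::sycl::default_selector()));      // build the device from a selector
  float src[4] = {1.f, 2.f, 3.f, 4.f};
  float dst[4] = {0.f, 0.f, 0.f, 0.f};
  // allocate() returns a host pointer that acts as the key into buffer_map
  float* gpu_ptr = static_cast<float*>(dev.allocate(sizeof(src)));
  dev.memcpyHostToDevice(gpu_ptr, src, sizeof(src));   // lazily creates the sycl buffer
  dev.memcpyDeviceToHost(dst, gpu_ptr, sizeof(src));   // reads back through a host accessor
  dev.deallocate(gpu_ptr);                             // drops the buffer and frees the key
}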
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
|
|
@@ -0,0 +1,282 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H)
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// Use the NonBlockingThreadPool by default. Define EIGEN_USE_SIMPLE_THREAD_POOL
|
||||
// to fall back to the simple (blocking) thread pool implementation.
|
||||
#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
|
||||
template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
|
||||
typedef NonBlockingThreadPool ThreadPool;
|
||||
#else
|
||||
template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
|
||||
typedef SimpleThreadPool ThreadPool;
|
||||
#endif
|
||||
|
||||
|
||||
// Barrier is an object that allows one or more threads to wait until
|
||||
// Notify has been called a specified number of times.
|
||||
class Barrier {
|
||||
public:
|
||||
Barrier(unsigned int count) : state_(count << 1), notified_(false) {
|
||||
eigen_assert(((count << 1) >> 1) == count);
|
||||
}
|
||||
~Barrier() {
|
||||
eigen_plain_assert((state_>>1) == 0);
|
||||
}
|
||||
|
||||
void Notify() {
|
||||
unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
|
||||
if (v != 1) {
|
||||
eigen_assert(((v + 2) & ~1) != 0);
|
||||
return; // either count has not dropped to 0, or waiter is not waiting
|
||||
}
|
||||
std::unique_lock<std::mutex> l(mu_);
|
||||
eigen_assert(!notified_);
|
||||
notified_ = true;
|
||||
cv_.notify_all();
|
||||
}
|
||||
|
||||
void Wait() {
|
||||
unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
|
||||
if ((v >> 1) == 0) return;
|
||||
std::unique_lock<std::mutex> l(mu_);
|
||||
while (!notified_) {
|
||||
cv_.wait(l);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::mutex mu_;
|
||||
std::condition_variable cv_;
|
||||
std::atomic<unsigned int> state_; // low bit is waiter flag
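// The remaining bits of state_ hold twice the number of outstanding notifications;
// Notify() subtracts 2 and only takes the mutex to wake the waiter once the count
// has reached zero and Wait() has set the waiter bit.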
|
||||
bool notified_;
|
||||
};
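// Illustrative usage sketch (not part of the original commit): a Barrier built with
// N lets a caller block in Wait() until Notify() has been called N times, e.g. once
// per task scheduled on the thread pool typedef'd above.
inline void barrier_usage_sketch() {
  ThreadPool pool(2);   // worker threads
  Barrier barrier(4);   // number of notifications to wait for
  for (int i = 0; i < 4; ++i) {
    pool.Schedule([&barrier]() { barrier.Notify(); });
  }
  barrier.Wait();       // returns once all four tasks have notified
}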
|
||||
|
||||
|
||||
// Notification is an object that allows a user to wait for another
|
||||
// thread to signal a notification that an event has occurred.
|
||||
//
|
||||
// Multiple threads can wait on the same Notification object,
|
||||
// but only one caller must call Notify() on the object.
|
||||
struct Notification : Barrier {
|
||||
Notification() : Barrier(1) {};
|
||||
};
|
||||
|
||||
|
||||
// Runs an arbitrary function and then calls Notify() on the passed in
|
||||
// Notification.
|
||||
template <typename Function, typename... Args> struct FunctionWrapperWithNotification
|
||||
{
|
||||
static void run(Notification* n, Function f, Args... args) {
|
||||
f(args...);
|
||||
if (n) {
|
||||
n->Notify();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Function, typename... Args> struct FunctionWrapperWithBarrier
|
||||
{
|
||||
static void run(Barrier* b, Function f, Args... args) {
|
||||
f(args...);
|
||||
if (b) {
|
||||
b->Notify();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename SyncType>
|
||||
static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
|
||||
if (n) {
|
||||
n->Wait();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Build a thread pool device on top of an existing pool of threads.
|
||||
struct ThreadPoolDevice {
|
||||
// The ownership of the thread pool remains with the caller.
|
||||
ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
|
||||
|
||||
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
|
||||
return internal::aligned_malloc(num_bytes);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
|
||||
internal::aligned_free(buffer);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
|
||||
::memcpy(dst, src, n);
|
||||
}
|
||||
EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
|
||||
memcpy(dst, src, n);
|
||||
}
|
||||
EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
|
||||
memcpy(dst, src, n);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
|
||||
::memset(buffer, c, n);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE int numThreads() const {
|
||||
return num_threads_;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||
return l1CacheSize();
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
||||
// The l3 cache size is shared between all the cores.
|
||||
return l3CacheSize() / num_threads_;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
||||
// Should return an enum that encodes the ISA supported by the CPU
|
||||
return 1;
|
||||
}
|
||||
|
||||
template <class Function, class... Args>
|
||||
EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
|
||||
Notification* n = new Notification();
|
||||
pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...));
|
||||
return n;
|
||||
}
|
||||
|
||||
template <class Function, class... Args>
|
||||
EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
|
||||
Function&& f,
|
||||
Args&&... args) const {
|
||||
pool_->Schedule(std::bind(
|
||||
&FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...));
|
||||
}
|
||||
|
||||
template <class Function, class... Args>
|
||||
EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
|
||||
pool_->Schedule(std::bind(f, args...));
|
||||
}
|
||||
|
||||
// Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
|
||||
// called from one of the threads in pool_. Returns -1 otherwise.
|
||||
EIGEN_STRONG_INLINE int currentThreadId() const {
|
||||
return pool_->CurrentThreadId();
|
||||
}
|
||||
|
||||
// parallelFor executes f with [0, n) arguments in parallel and waits for
|
||||
// completion. F accepts a half-open interval [first, last).
|
||||
// Block size is chosen based on the iteration cost and resulting parallel
|
||||
// efficiency. If block_align is not nullptr, it is called to round up the
|
||||
// block size.
|
||||
void parallelFor(Index n, const TensorOpCost& cost,
|
||||
std::function<Index(Index)> block_align,
|
||||
std::function<void(Index, Index)> f) const {
|
||||
typedef TensorCostModel<ThreadPoolDevice> CostModel;
|
||||
if (n <= 1 || numThreads() == 1 ||
|
||||
CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
|
||||
f(0, n);
|
||||
return;
|
||||
}
|
||||
|
||||
// Calculate block size based on (1) the iteration cost and (2) parallel
|
||||
// efficiency. We want blocks to be not too small to mitigate
|
||||
// parallelization overheads; not too large to mitigate tail
|
||||
// effects and potential load imbalance, and we also want the number
|
||||
// of blocks to divide evenly among the threads.
|
||||
|
||||
double block_size_f = 1.0 / CostModel::taskSize(1, cost);
|
||||
const Index max_oversharding_factor = 4;
|
||||
Index block_size = numext::mini(
|
||||
n, numext::maxi<Index>(divup<Index>(n, max_oversharding_factor * numThreads()),
|
||||
block_size_f));
|
||||
const Index max_block_size = numext::mini(n, 2 * block_size);
|
||||
if (block_align) {
|
||||
Index new_block_size = block_align(block_size);
|
||||
eigen_assert(new_block_size >= block_size);
|
||||
block_size = numext::mini(n, new_block_size);
|
||||
}
|
||||
Index block_count = divup(n, block_size);
|
||||
// Calculate parallel efficiency as fraction of total CPU time used for
|
||||
// computations:
|
||||
double max_efficiency =
|
||||
static_cast<double>(block_count) /
|
||||
(divup<int>(block_count, numThreads()) * numThreads());
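// For example, with 4 threads and 10 blocks the efficiency is
// 10 / (ceil(10/4) * 4) = 10/12 ~= 0.83, whereas 8 blocks give 8 / (2*4) = 1.0,
// which is why the loop below tries to coarsen the block size.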
|
||||
// Now try to increase block size up to max_block_size as long as it
|
||||
// doesn't decrease parallel efficiency.
|
||||
for (Index prev_block_count = block_count;
|
||||
max_efficiency < 1.0 && prev_block_count > 1;) {
|
||||
// This is the next block size that divides size into a smaller number
|
||||
// of blocks than the current block_size.
|
||||
Index coarser_block_size = divup(n, prev_block_count - 1);
|
||||
if (block_align) {
|
||||
Index new_block_size = block_align(coarser_block_size);
|
||||
eigen_assert(new_block_size >= coarser_block_size);
|
||||
coarser_block_size = numext::mini(n, new_block_size);
|
||||
}
|
||||
if (coarser_block_size > max_block_size) {
|
||||
break; // Reached max block size. Stop.
|
||||
}
|
||||
// Recalculate parallel efficiency.
|
||||
const Index coarser_block_count = divup(n, coarser_block_size);
|
||||
eigen_assert(coarser_block_count < prev_block_count);
|
||||
prev_block_count = coarser_block_count;
|
||||
const double coarser_efficiency =
|
||||
static_cast<double>(coarser_block_count) /
|
||||
(divup<int>(coarser_block_count, numThreads()) * numThreads());
|
||||
if (coarser_efficiency + 0.01 >= max_efficiency) {
|
||||
// Taking it.
|
||||
block_size = coarser_block_size;
|
||||
block_count = coarser_block_count;
|
||||
if (max_efficiency < coarser_efficiency) {
|
||||
max_efficiency = coarser_efficiency;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Recursively divide size into halves until we reach block_size.
|
||||
// Division code rounds mid to block_size, so we are guaranteed to get
|
||||
// block_count leaves that do actual computations.
|
||||
Barrier barrier(static_cast<unsigned int>(block_count));
|
||||
std::function<void(Index, Index)> handleRange;
|
||||
handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
|
||||
if (last - first <= block_size) {
|
||||
// Single block or less, execute directly.
|
||||
f(first, last);
|
||||
barrier.Notify();
|
||||
return;
|
||||
}
|
||||
// Split into halves and submit to the pool.
|
||||
Index mid = first + divup((last - first) / 2, block_size) * block_size;
|
||||
pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
|
||||
pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
|
||||
};
|
||||
handleRange(0, n);
|
||||
barrier.Wait();
|
||||
}
|
||||
|
||||
// Convenience wrapper for parallelFor that does not align blocks.
|
||||
void parallelFor(Index n, const TensorOpCost& cost,
|
||||
std::function<void(Index, Index)> f) const {
|
||||
parallelFor(n, cost, nullptr, std::move(f));
|
||||
}
|
||||
|
||||
private:
|
||||
ThreadPoolInterface* pool_;
|
||||
int num_threads_;
|
||||
};
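// Illustrative usage sketch (not part of the original commit): filling a buffer
// through parallelFor. The TensorOpCost arguments (bytes loaded, bytes stored,
// compute cycles per coefficient) only steer the block-size heuristic above; the
// pool pointer and core count are placeholders.
inline void parallel_for_usage_sketch(ThreadPoolInterface* pool, float* data, Index n) {
  ThreadPoolDevice device(pool, /*num_cores=*/4);
  const TensorOpCost cost(/*bytes_loaded=*/0, /*bytes_stored=*/sizeof(float), /*compute_cycles=*/1);
  device.parallelFor(n, cost, [data](Index first, Index last) {
    for (Index i = first; i < last; ++i) data[i] = static_cast<float>(i);
  });
}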
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
|
|
@@ -0,0 +1,236 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class TensorDimensionList
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n.
|
||||
*
|
||||
* \sa Tensor
|
||||
*/
|
||||
|
||||
template <typename Index, std::size_t Rank> struct DimensionList {
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
const Index operator[] (const Index i) const { return i; }
|
||||
};
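// Illustrative sketch (not part of the original commit): DimensionList maps i -> i,
// so a rank-3 list enumerates the dimensions 0, 1 and 2 of a tensor without storing
// anything.
inline void dimension_list_usage_sketch() {
  DimensionList<DenseIndex, 3> all_dims;
  eigen_assert(all_dims[0] == 0 && all_dims[1] == 1 && all_dims[2] == 2);
  EIGEN_UNUSED_VARIABLE(all_dims);
}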
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename Index, std::size_t Rank> struct array_size<DimensionList<Index, Rank> > {
|
||||
static const size_t value = Rank;
|
||||
};
|
||||
template<typename Index, std::size_t Rank> struct array_size<const DimensionList<Index, Rank> > {
|
||||
static const size_t value = Rank;
|
||||
};
|
||||
|
||||
template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(DimensionList<Index, Rank>&) {
|
||||
return n;
|
||||
}
|
||||
template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(const DimensionList<Index, Rank>&) {
|
||||
return n;
|
||||
}
|
||||
|
||||
|
||||
#if EIGEN_HAS_CONSTEXPR
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_known_statically_impl<DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_known_statically_impl<const DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct all_indices_known_statically_impl<DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run() {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run() {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run() {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run() {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_eq_impl<DimensionList<Index, Rank> > {
|
||||
static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return i == value;
|
||||
}
|
||||
};
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_eq_impl<const DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return i == value;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_ne_impl<DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return i != value;
|
||||
}
|
||||
};
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_ne_impl<const DimensionList<Index, Rank> > {
|
||||
static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return i != value;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_gt_impl<DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return i > value;
|
||||
}
|
||||
};
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_gt_impl<const DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return i > value;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_lt_impl<DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return i < value;
|
||||
}
|
||||
};
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_lt_impl<const DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return i < value;
|
||||
}
|
||||
};
|
||||
|
||||
#else
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_known_statically_impl<DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_known_statically_impl<const DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct all_indices_known_statically_impl<DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_eq_impl<DimensionList<Index, Rank> > {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_eq_impl<const DimensionList<Index, Rank> > {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_ne_impl<DimensionList<Index, Rank> > {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex){
|
||||
return false;
|
||||
}
|
||||
};
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_ne_impl<const DimensionList<Index, Rank> > {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_gt_impl<DimensionList<Index, Rank> > {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_gt_impl<const DimensionList<Index, Rank> > {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_lt_impl<DimensionList<Index, Rank> > {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
template <typename Index, std::size_t Rank>
|
||||
struct index_statically_lt_impl<const DimensionList<Index, Rank> > {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
|
|
@@ -0,0 +1,428 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class TensorDimensions
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Set of classes used to encode and store the dimensions of a Tensor.
|
||||
*
|
||||
* The Sizes class encodes as part of the type the number of dimensions and the
|
||||
* sizes corresponding to each dimension. It uses no storage space since it is
|
||||
* entirely known at compile time.
|
||||
* The DSizes class is its dynamic sibling: the number of dimensions is known
|
||||
* at compile time but the sizes are set during execution.
|
||||
*
|
||||
* \sa Tensor
|
||||
*/
|
||||
|
||||
// Boilerplate code
|
||||
namespace internal {
|
||||
|
||||
template<std::size_t n, typename Dimension> struct dget {
|
||||
static const std::size_t value = get<n, Dimension>::value;
|
||||
};
|
||||
|
||||
|
||||
template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
|
||||
struct fixed_size_tensor_index_linearization_helper
|
||||
{
|
||||
template <typename Dimensions> EIGEN_DEVICE_FUNC
|
||||
static inline Index run(array<Index, NumIndices> const& indices,
|
||||
const Dimensions& dimensions)
|
||||
{
|
||||
return array_get<RowMajor ? n - 1 : (NumIndices - n)>(indices) +
|
||||
dget<RowMajor ? n - 1 : (NumIndices - n), Dimensions>::value *
|
||||
fixed_size_tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Index, std::size_t NumIndices, bool RowMajor>
|
||||
struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
|
||||
{
|
||||
template <typename Dimensions> EIGEN_DEVICE_FUNC
|
||||
static inline Index run(array<Index, NumIndices> const&, const Dimensions&)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Index, std::size_t n>
|
||||
struct fixed_size_tensor_index_extraction_helper
|
||||
{
|
||||
template <typename Dimensions> EIGEN_DEVICE_FUNC
|
||||
static inline Index run(const Index index,
|
||||
const Dimensions& dimensions)
|
||||
{
|
||||
const Index mult = (index == n-1) ? 1 : 0;
|
||||
return array_get<n-1>(dimensions) * mult +
|
||||
fixed_size_tensor_index_extraction_helper<Index, n - 1>::run(index, dimensions);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Index>
|
||||
struct fixed_size_tensor_index_extraction_helper<Index, 0>
|
||||
{
|
||||
template <typename Dimensions> EIGEN_DEVICE_FUNC
|
||||
static inline Index run(const Index,
|
||||
const Dimensions&)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
// Fixed size
|
||||
#ifndef EIGEN_EMULATE_CXX11_META_H
|
||||
template <typename std::ptrdiff_t... Indices>
|
||||
struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
|
||||
typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
|
||||
static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
|
||||
return Base::count;
|
||||
}
|
||||
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() {
|
||||
return internal::arg_prod(Indices...);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Sizes() { }
|
||||
template <typename DenseIndex>
|
||||
explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
|
||||
// todo: add assertion
|
||||
}
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template <typename... DenseIndex> EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { }
|
||||
explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
|
||||
// todo: add assertion
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename T> Sizes& operator = (const T& /*other*/) {
|
||||
// add assertion failure if the size of other is different
|
||||
return *this;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const {
|
||||
return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, *this);
|
||||
}
|
||||
|
||||
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
|
||||
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *static_cast<const Base*>(this));
|
||||
}
|
||||
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
|
||||
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *static_cast<const Base*>(this));
|
||||
}
|
||||
};
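// Illustrative sketch (not part of the original commit): a fully static shape.
// Sizes<2, 3, 4> has rank 3 and TotalSize() == 24 while occupying no storage; the
// column-major index of coordinates (1, 2, 3) is 1 + 2*(2 + 3*3) = 23.
inline void static_sizes_usage_sketch() {
  Sizes<2, 3, 4> shape;
  const array<DenseIndex, 3> coords = {{1, 2, 3}};
  eigen_assert(shape.TotalSize() == 24);
  eigen_assert(shape.IndexOfColMajor(coords) == 23);
  EIGEN_UNUSED_VARIABLE(shape);
  EIGEN_UNUSED_VARIABLE(coords);
}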
|
||||
|
||||
namespace internal {
|
||||
template <typename std::ptrdiff_t... Indices>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<Indices...>&) {
|
||||
return Sizes<Indices...>::total_size;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
template <std::size_t n>
|
||||
struct non_zero_size {
|
||||
typedef internal::type2val<std::size_t, n> type;
|
||||
};
|
||||
template <>
|
||||
struct non_zero_size<0> {
|
||||
typedef internal::null_type type;
|
||||
};
|
||||
|
||||
template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0, std::size_t V5=0> struct Sizes {
|
||||
typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base;
|
||||
static const size_t count = Base::count;
|
||||
static const std::size_t total_size = internal::arg_prod<Base>::value;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
|
||||
return count;
|
||||
}
|
||||
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() {
|
||||
return internal::arg_prod<Base>::value;
|
||||
}
|
||||
|
||||
Sizes() { }
|
||||
template <typename DenseIndex>
|
||||
explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
|
||||
// todo: add assertion
|
||||
}
|
||||
template <typename T> Sizes& operator = (const T& /*other*/) {
|
||||
// add assertion failure if the size of other is different
|
||||
return *this;
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
|
||||
explicit Sizes(std::initializer_list<std::size_t>) {
|
||||
// todo: add assertion
|
||||
}
|
||||
#else
|
||||
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) {
|
||||
}
|
||||
EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) {
|
||||
}
|
||||
EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
|
||||
}
|
||||
EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
|
||||
}
|
||||
EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index operator[] (const Index index) const {
|
||||
switch (index) {
|
||||
case 0:
|
||||
return internal::get<0, Base>::value;
|
||||
case 1:
|
||||
return internal::get<1, Base>::value;
|
||||
case 2:
|
||||
return internal::get<2, Base>::value;
|
||||
case 3:
|
||||
return internal::get<3, Base>::value;
|
||||
case 4:
|
||||
return internal::get<4, Base>::value;
|
||||
default:
|
||||
eigen_assert(false && "index overflow");
|
||||
return static_cast<Index>(-1);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
|
||||
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *reinterpret_cast<const Base*>(this));
|
||||
}
|
||||
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
|
||||
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *reinterpret_cast<const Base*>(this));
|
||||
}
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
|
||||
return Sizes<V1, V2, V3, V4, V5>::total_size;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// Boilerplate
|
||||
namespace internal {
|
||||
template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
|
||||
struct tensor_index_linearization_helper
|
||||
{
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const& dimensions)
|
||||
{
|
||||
return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
|
||||
array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
|
||||
tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Index, std::size_t NumIndices, bool RowMajor>
|
||||
struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
|
||||
{
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const&)
|
||||
{
|
||||
return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
|
||||
}
|
||||
};
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
// Dynamic size
|
||||
template <typename DenseIndex, int NumDims>
|
||||
struct DSizes : array<DenseIndex, NumDims> {
|
||||
typedef array<DenseIndex, NumDims> Base;
|
||||
static const int count = NumDims;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
|
||||
return NumDims;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const {
|
||||
return (NumDims == 0) ? 1 : internal::array_prod(*static_cast<const Base*>(this));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() {
|
||||
for (int i = 0 ; i < NumDims; ++i) {
|
||||
(*this)[i] = 0;
|
||||
}
|
||||
}
|
||||
EIGEN_DEVICE_FUNC explicit DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { }
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) {
|
||||
eigen_assert(NumDims == 1);
|
||||
(*this)[0] = i0;
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
|
||||
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
}
|
||||
#else
|
||||
EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) {
|
||||
eigen_assert(NumDims == 2);
|
||||
(*this)[0] = i0;
|
||||
(*this)[1] = i1;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
|
||||
eigen_assert(NumDims == 3);
|
||||
(*this)[0] = i0;
|
||||
(*this)[1] = i1;
|
||||
(*this)[2] = i2;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
|
||||
eigen_assert(NumDims == 4);
|
||||
(*this)[0] = i0;
|
||||
(*this)[1] = i1;
|
||||
(*this)[2] = i2;
|
||||
(*this)[3] = i3;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
|
||||
eigen_assert(NumDims == 5);
|
||||
(*this)[0] = i0;
|
||||
(*this)[1] = i1;
|
||||
(*this)[2] = i2;
|
||||
(*this)[3] = i3;
|
||||
(*this)[4] = i4;
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC DSizes& operator = (const array<DenseIndex, NumDims>& other) {
|
||||
*static_cast<Base*>(this) = other;
|
||||
return *this;
|
||||
}
|
||||
|
||||
// A constexpr would be so much better here
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const {
|
||||
return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this));
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const {
|
||||
return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this));
|
||||
}
|
||||
};
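// Illustrative sketch (not part of the original commit): DSizes is the dynamic
// sibling of Sizes: the rank is fixed at compile time, the extents at run time.
inline void dynamic_sizes_usage_sketch() {
  DSizes<DenseIndex, 3> shape(2, 3, 4);
  eigen_assert(shape.TotalSize() == 24);
  eigen_assert(shape[2] == 4);
  EIGEN_UNUSED_VARIABLE(shape);
}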
|
||||
|
||||
|
||||
|
||||
|
||||
// Boilerplate
|
||||
namespace internal {
|
||||
template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
|
||||
struct tensor_vsize_index_linearization_helper
|
||||
{
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const& dimensions)
|
||||
{
|
||||
return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
|
||||
array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
|
||||
tensor_vsize_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Index, std::size_t NumIndices, bool RowMajor>
|
||||
struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
|
||||
{
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const&)
|
||||
{
|
||||
return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
|
||||
}
|
||||
};
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
namespace internal {
|
||||
|
||||
template <typename DenseIndex, int NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > {
|
||||
static const size_t value = NumDims;
|
||||
};
|
||||
template <typename DenseIndex, int NumDims> struct array_size<DSizes<DenseIndex, NumDims> > {
|
||||
static const size_t value = NumDims;
|
||||
};
|
||||
#ifndef EIGEN_EMULATE_CXX11_META_H
|
||||
template <typename std::ptrdiff_t... Indices> struct array_size<const Sizes<Indices...> > {
|
||||
static const std::ptrdiff_t value = Sizes<Indices...>::count;
|
||||
};
|
||||
template <typename std::ptrdiff_t... Indices> struct array_size<Sizes<Indices...> > {
|
||||
static const std::ptrdiff_t value = Sizes<Indices...>::count;
|
||||
};
|
||||
template <std::ptrdiff_t n, typename std::ptrdiff_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) {
|
||||
return get<n, internal::numeric_list<std::size_t, Indices...> >::value;
|
||||
}
|
||||
template <std::ptrdiff_t n> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) {
|
||||
eigen_assert(false && "should never be called");
|
||||
return -1;
|
||||
}
|
||||
#else
|
||||
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
|
||||
static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
|
||||
};
|
||||
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
|
||||
static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
|
||||
};
|
||||
template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
|
||||
return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
template <typename Dims1, typename Dims2, size_t n, size_t m>
|
||||
struct sizes_match_below_dim {
|
||||
static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
template <typename Dims1, typename Dims2, size_t n>
|
||||
struct sizes_match_below_dim<Dims1, Dims2, n, n> {
|
||||
static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) {
|
||||
return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &
|
||||
sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2);
|
||||
}
|
||||
};
|
||||
template <typename Dims1, typename Dims2>
|
||||
struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
|
||||
static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
template <typename Dims1, typename Dims2>
|
||||
EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) {
|
||||
return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2);
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
|
|
@@ -0,0 +1,181 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorEvalToOp
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Expression node that evaluates its argument into a user-provided buffer.
|
||||
*
|
||||
*
|
||||
*/
|
||||
namespace internal {
|
||||
template<typename XprType, template <class> class MakePointer_>
|
||||
struct traits<TensorEvalToOp<XprType, MakePointer_> >
|
||||
{
|
||||
// Type promotion to handle the case where the types of the lhs and the rhs are different.
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
|
||||
enum {
|
||||
Flags = 0
|
||||
};
|
||||
template <class T>
|
||||
struct MakePointer {
|
||||
// Intermediate typedef to work around an MSVC issue.
|
||||
typedef MakePointer_<T> MakePointerT;
|
||||
typedef typename MakePointerT::Type Type;
|
||||
};
|
||||
};
|
||||
|
||||
template<typename XprType, template <class> class MakePointer_>
|
||||
struct eval<TensorEvalToOp<XprType, MakePointer_>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorEvalToOp<XprType, MakePointer_>& type;
|
||||
};
|
||||
|
||||
template<typename XprType, template <class> class MakePointer_>
|
||||
struct nested<TensorEvalToOp<XprType, MakePointer_>, 1, typename eval<TensorEvalToOp<XprType, MakePointer_> >::type>
|
||||
{
|
||||
typedef TensorEvalToOp<XprType, MakePointer_> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
|
||||
template<typename XprType, template <class> class MakePointer_>
|
||||
class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
||||
typedef typename MakePointer_<CoeffReturnType>::Type PointerType;
|
||||
typedef typename Eigen::internal::nested<TensorEvalToOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
|
||||
: m_xpr(expr), m_buffer(buffer) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
PointerType m_buffer;
|
||||
};
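// Illustrative sketch (not part of the original commit): TensorEvalToOp pairs an
// expression with a destination buffer, and the evaluator below writes coefficient
// i of the expression straight into buffer[i] instead of materializing a separate
// temporary. A hypothetical construction (names are placeholders):
//
//   float* buffer = static_cast<float*>(device.allocate(n * sizeof(float)));
//   TensorEvalToOp<decltype(expr)> eval_to(buffer, expr);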
|
||||
|
||||
|
||||
|
||||
template<typename ArgType, typename Device, template <class> class MakePointer_>
|
||||
struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
||||
{
|
||||
typedef TensorEvalToOp<ArgType, MakePointer_> XprType;
|
||||
typedef typename ArgType::Scalar Scalar;
|
||||
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_device(device),
|
||||
m_buffer(op.buffer()), m_op(op), m_expression(op.expression())
|
||||
{ }
|
||||
|
||||
// Used for accessor extraction in SYCL Managed TensorMap:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const {
|
||||
return m_op;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
|
||||
}
|
||||
|
||||
typedef typename internal::traits<const TensorEvalToOp<ArgType, MakePointer_> >::template MakePointer<CoeffReturnType>::Type DevicePointer;
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) {
|
||||
EIGEN_UNUSED_VARIABLE(scalar);
|
||||
eigen_assert(scalar == NULL);
|
||||
return m_impl.evalSubExprsIfNeeded(m_buffer);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
|
||||
m_buffer[i] = m_impl.coeff(i);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
|
||||
internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_buffer[index];
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
// We assume that evalPacket or evalScalar is called to perform the
|
||||
// assignment and account for the cost of the write here.
|
||||
return m_impl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; }
|
||||
ArgType expression() const { return m_expression; }
|
||||
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||
/// added for sycl in order to construct the buffer from the sycl device
|
||||
const Device& device() const{return m_device;}
|
||||
|
||||
private:
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
const Device& m_device;
|
||||
DevicePointer m_buffer;
|
||||
const XprType& m_op;
|
||||
const ArgType m_expression;
|
||||
};
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
|
|
@@ -0,0 +1,633 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorEvaluator
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief The tensor evaluator classes.
|
||||
*
|
||||
* These classes are responsible for the evaluation of the tensor expression.
|
||||
*
|
||||
* TODO: add support for more types of expressions, in particular expressions
|
||||
* leading to lvalues (slicing, reshaping, etc...)
|
||||
*/
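// Illustrative sketch (not part of the original commit) of the protocol shared by
// the evaluators below: construct from an expression and a device, evaluate
// sub-expressions if needed, read coefficients (or packets), then clean up.
// 'Expr' and 'expr' are placeholders:
//
//   TensorEvaluator<const Expr, DefaultDevice> eval(expr, DefaultDevice());
//   eval.evalSubExprsIfNeeded(NULL);
//   float v = eval.coeff(0);
//   eval.cleanup();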
|
||||
|
||||
// Generic evaluator
|
||||
template<typename Derived, typename Device>
|
||||
struct TensorEvaluator
|
||||
{
|
||||
typedef typename Derived::Index Index;
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename Derived::Scalar CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef typename Derived::Dimensions Dimensions;
|
||||
|
||||
// NumDimensions is -1 for variable dim tensors
|
||||
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
|
||||
internal::traits<Derived>::NumDimensions : 0;
|
||||
|
||||
enum {
|
||||
IsAligned = Derived::IsAligned,
|
||||
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
|
||||
Layout = Derived::Layout,
|
||||
CoordAccess = NumCoords > 0,
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
|
||||
: m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m)
|
||||
{ }
|
||||
|
||||
// Used for accessor extraction in SYCL Managed TensorMap:
|
||||
const Derived& derived() const { return m_impl; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) {
|
||||
if (dest) {
|
||||
m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize());
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
|
||||
eigen_assert(m_data);
|
||||
return m_data[index];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
|
||||
eigen_assert(m_data);
|
||||
return m_data[index];
|
||||
}
|
||||
|
||||
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
PacketReturnType packet(Index index) const
|
||||
{
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
|
||||
}
|
||||
|
||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x)
|
||||
{
|
||||
return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
|
||||
eigen_assert(m_data);
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
return m_data[m_dims.IndexOfColMajor(coords)];
|
||||
} else {
|
||||
return m_data[m_dims.IndexOfRowMajor(coords)];
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) {
|
||||
eigen_assert(m_data);
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
return m_data[m_dims.IndexOfColMajor(coords)];
|
||||
} else {
|
||||
return m_data[m_dims.IndexOfRowMajor(coords)];
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
|
||||
internal::unpacket_traits<PacketReturnType>::size);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; }
|
||||
|
||||
/// required by sycl in order to construct sycl buffer from raw pointer
|
||||
const Device& device() const{return m_device;}
|
||||
|
||||
protected:
|
||||
typename internal::traits<Derived>::template MakePointer<Scalar>::Type m_data;
|
||||
Dimensions m_dims;
|
||||
const Device& m_device;
|
||||
const Derived& m_impl;
|
||||
};
|
||||
|
||||
namespace {
|
||||
template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T loadConstant(const T* address) {
|
||||
return *address;
|
||||
}
|
||||
// Use the texture cache on CUDA devices whenever possible
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float loadConstant(const float* address) {
|
||||
return __ldg(address);
|
||||
}
|
||||
template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double loadConstant(const double* address) {
|
||||
return __ldg(address);
|
||||
}
|
||||
template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
Eigen::half loadConstant(const Eigen::half* address) {
|
||||
return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// Default evaluator for rvalues
|
||||
template<typename Derived, typename Device>
|
||||
struct TensorEvaluator<const Derived, Device>
|
||||
{
|
||||
typedef typename Derived::Index Index;
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename Derived::Scalar CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef typename Derived::Dimensions Dimensions;
|
||||
|
||||
// NumDimensions is -1 for variable dim tensors
|
||||
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
|
||||
internal::traits<Derived>::NumDimensions : 0;
|
||||
|
||||
enum {
|
||||
IsAligned = Derived::IsAligned,
|
||||
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
|
||||
Layout = Derived::Layout,
|
||||
CoordAccess = NumCoords > 0,
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
// Used for accessor extraction in SYCL Managed TensorMap:
|
||||
const Derived& derived() const { return m_impl; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
|
||||
: m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m)
|
||||
{ }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
|
||||
if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) {
|
||||
m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
|
||||
eigen_assert(m_data);
|
||||
return loadConstant(m_data+index);
|
||||
}
|
||||
|
||||
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
PacketReturnType packet(Index index) const
|
||||
{
|
||||
return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
|
||||
eigen_assert(m_data);
|
||||
const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
|
||||
: m_dims.IndexOfRowMajor(coords);
|
||||
return loadConstant(m_data+index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
|
||||
internal::unpacket_traits<PacketReturnType>::size);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; }
|
||||
|
||||
/// added for sycl in order to construct the buffer from the sycl device
|
||||
const Device& device() const{return m_device;}
|
||||
|
||||
protected:
|
||||
typename internal::traits<Derived>::template MakePointer<const Scalar>::Type m_data;
|
||||
Dimensions m_dims;
|
||||
const Device& m_device;
|
||||
const Derived& m_impl;
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
// -------------------- CwiseNullaryOp --------------------
|
||||
|
||||
template<typename NullaryOp, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
|
||||
{
|
||||
typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
|
||||
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
|
||||
{ }
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_wrapper(m_functor, index);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
|
||||
internal::unpacket_traits<PacketReturnType>::size);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
|
||||
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
NullaryOp functor() const { return m_functor; }
|
||||
|
||||
|
||||
private:
|
||||
const NullaryOp m_functor;
|
||||
TensorEvaluator<ArgType, Device> m_argImpl;
|
||||
const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
|
||||
};
|
||||
|
||||
|
||||
|
||||
// -------------------- CwiseUnaryOp --------------------
|
||||
|
||||
template<typename UnaryOp, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
|
||||
{
|
||||
typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_functor(op.functor()),
|
||||
m_argImpl(op.nestedExpression(), device)
|
||||
{ }
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
|
||||
m_argImpl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_argImpl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_functor(m_argImpl.coeff(index));
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
|
||||
return m_argImpl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
|
||||
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<ArgType, Device> & impl() const { return m_argImpl; }
|
||||
/// added for sycl in order to construct the buffer from sycl device
|
||||
UnaryOp functor() const { return m_functor; }
|
||||
|
||||
|
||||
private:
|
||||
const UnaryOp m_functor;
|
||||
TensorEvaluator<ArgType, Device> m_argImpl;
|
||||
};
|
||||
|
||||
|
||||
// -------------------- CwiseBinaryOp --------------------
|
||||
|
||||
template<typename BinaryOp, typename LeftArgType, typename RightArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
|
||||
{
|
||||
typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess &
|
||||
internal::functor_traits<BinaryOp>::PacketAccess,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_functor(op.functor()),
|
||||
m_leftImpl(op.lhsExpression(), device),
|
||||
m_rightImpl(op.rhsExpression(), device)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
|
||||
}
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
|
||||
{
|
||||
// TODO: use right impl instead if right impl dimensions are known at compile time.
|
||||
return m_leftImpl.dimensions();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
|
||||
m_leftImpl.evalSubExprsIfNeeded(NULL);
|
||||
m_rightImpl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_leftImpl.cleanup();
|
||||
m_rightImpl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index));
|
||||
}
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
|
||||
return m_leftImpl.costPerCoeff(vectorized) +
|
||||
m_rightImpl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
BinaryOp functor() const { return m_functor; }
|
||||
|
||||
private:
|
||||
const BinaryOp m_functor;
|
||||
TensorEvaluator<LeftArgType, Device> m_leftImpl;
|
||||
TensorEvaluator<RightArgType, Device> m_rightImpl;
|
||||
};
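// Illustrative note (not in the original source): costPerCoeff() composes
// recursively. For an expression such as a + b * c, the evaluator of b * c
// reports its operands' cost plus the multiply functor cost, and this binary
// evaluator then adds the cost of a and the add functor, with the functor cost
// amortized over PacketSize lanes when the evaluation is vectorized.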
|
||||
|
||||
// -------------------- CwiseTernaryOp --------------------
|
||||
|
||||
template<typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device>
|
||||
struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device>
|
||||
{
|
||||
typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
|
||||
internal::functor_traits<TernaryOp>::PacketAccess,
|
||||
Layout = TensorEvaluator<Arg1Type, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_functor(op.functor()),
|
||||
m_arg1Impl(op.arg1Expression(), device),
|
||||
m_arg2Impl(op.arg2Expression(), device),
|
||||
m_arg3Impl(op.arg3Expression(), device)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
|
||||
typename internal::traits<Arg2Type>::StorageKind>::value),
|
||||
STORAGE_KIND_MUST_MATCH)
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
|
||||
typename internal::traits<Arg3Type>::StorageKind>::value),
|
||||
STORAGE_KIND_MUST_MATCH)
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
|
||||
typename internal::traits<Arg2Type>::Index>::value),
|
||||
STORAGE_INDEX_MUST_MATCH)
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
|
||||
typename internal::traits<Arg3Type>::Index>::value),
|
||||
STORAGE_INDEX_MUST_MATCH)
|
||||
|
||||
eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
|
||||
}
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
|
||||
{
|
||||
// TODO: use arg2 or arg3 dimensions if they are known at compile time.
|
||||
return m_arg1Impl.dimensions();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
|
||||
m_arg1Impl.evalSubExprsIfNeeded(NULL);
|
||||
m_arg2Impl.evalSubExprsIfNeeded(NULL);
|
||||
m_arg3Impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_arg1Impl.cleanup();
|
||||
m_arg2Impl.cleanup();
|
||||
m_arg3Impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
|
||||
}
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index),
|
||||
m_arg2Impl.template packet<LoadMode>(index),
|
||||
m_arg3Impl.template packet<LoadMode>(index));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
const double functor_cost = internal::functor_traits<TernaryOp>::Cost;
|
||||
return m_arg1Impl.costPerCoeff(vectorized) +
|
||||
m_arg2Impl.costPerCoeff(vectorized) +
|
||||
m_arg3Impl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
|
||||
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<Arg1Type, Device> & arg1Impl() const { return m_arg1Impl; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<Arg2Type, Device>& arg2Impl() const { return m_arg2Impl; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<Arg3Type, Device>& arg3Impl() const { return m_arg3Impl; }
|
||||
|
||||
private:
|
||||
const TernaryOp m_functor;
|
||||
TensorEvaluator<Arg1Type, Device> m_arg1Impl;
|
||||
TensorEvaluator<Arg2Type, Device> m_arg2Impl;
|
||||
TensorEvaluator<Arg3Type, Device> m_arg3Impl;
|
||||
};
|
||||
|
||||
|
||||
// -------------------- SelectOp --------------------
|
||||
|
||||
template<typename IfArgType, typename ThenArgType, typename ElseArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
|
||||
{
|
||||
typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
|
||||
internal::packet_traits<Scalar>::HasBlend,
|
||||
Layout = TensorEvaluator<IfArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_condImpl(op.ifExpression(), device),
|
||||
m_thenImpl(op.thenExpression(), device),
|
||||
m_elseImpl(op.elseExpression(), device)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions()));
|
||||
eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions()));
|
||||
}
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
|
||||
{
|
||||
// TODO: use then or else impl instead if they happen to be known at compile time.
|
||||
return m_condImpl.dimensions();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
|
||||
m_condImpl.evalSubExprsIfNeeded(NULL);
|
||||
m_thenImpl.evalSubExprsIfNeeded(NULL);
|
||||
m_elseImpl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_condImpl.cleanup();
|
||||
m_thenImpl.cleanup();
|
||||
m_elseImpl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
|
||||
}
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
|
||||
{
|
||||
internal::Selector<PacketSize> select;
|
||||
for (Index i = 0; i < PacketSize; ++i) {
|
||||
select.select[i] = m_condImpl.coeff(index+i);
|
||||
}
|
||||
return internal::pblend(select,
|
||||
m_thenImpl.template packet<LoadMode>(index),
|
||||
m_elseImpl.template packet<LoadMode>(index));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
return m_condImpl.costPerCoeff(vectorized) +
|
||||
m_thenImpl.costPerCoeff(vectorized)
|
||||
.cwiseMax(m_elseImpl.costPerCoeff(vectorized));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<ThenArgType, Device>& then_impl() const { return m_thenImpl; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<ElseArgType, Device>& else_impl() const { return m_elseImpl; }
|
||||
|
||||
private:
|
||||
TensorEvaluator<IfArgType, Device> m_condImpl;
|
||||
TensorEvaluator<ThenArgType, Device> m_thenImpl;
|
||||
TensorEvaluator<ElseArgType, Device> m_elseImpl;
|
||||
};
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
@ -0,0 +1,288 @@
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorExecutor
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief The tensor executor class.
|
||||
*
|
||||
* This class is responsible for launching the evaluation of the expression on
* the specified computing device.
|
||||
*/
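// Illustrative usage sketch (comment only, not part of the original file):
// assigning a tensor expression is what ultimately routes through
// TensorExecutor::run for the selected device, e.g.
//
//   Eigen::Tensor<float, 2> in(64, 64), out(64, 64);
//   in.setRandom();
//   out = in * in + in;   // evaluated via TensorExecutor<..., DefaultDevice>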
|
||||
namespace internal {
|
||||
|
||||
// Default strategy: the expression is evaluated with a single cpu thread.
|
||||
template<typename Expression, typename Device, bool Vectorizable>
|
||||
class TensorExecutor
|
||||
{
|
||||
public:
|
||||
typedef typename Expression::Index Index;
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline void run(const Expression& expr, const Device& device = Device())
|
||||
{
|
||||
TensorEvaluator<Expression, Device> evaluator(expr, device);
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||
if (needs_assign)
|
||||
{
|
||||
const Index size = array_prod(evaluator.dimensions());
|
||||
for (Index i = 0; i < size; ++i) {
|
||||
evaluator.evalScalar(i);
|
||||
}
|
||||
}
|
||||
evaluator.cleanup();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<typename Expression>
|
||||
class TensorExecutor<Expression, DefaultDevice, true>
|
||||
{
|
||||
public:
|
||||
typedef typename Expression::Index Index;
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
|
||||
{
|
||||
TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||
if (needs_assign)
|
||||
{
|
||||
const Index size = array_prod(evaluator.dimensions());
|
||||
const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
|
||||
// Give the compiler a strong hint to unroll the loop. But don't insist
|
||||
// on unrolling, because if the function is expensive the compiler should not
|
||||
// unroll the loop at the expense of inlining.
|
||||
const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
|
||||
for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
|
||||
for (Index j = 0; j < 4; j++) {
|
||||
evaluator.evalPacket(i + j * PacketSize);
|
||||
}
|
||||
}
|
||||
const Index VectorizedSize = (size / PacketSize) * PacketSize;
|
||||
for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
|
||||
evaluator.evalPacket(i);
|
||||
}
|
||||
for (Index i = VectorizedSize; i < size; ++i) {
|
||||
evaluator.evalScalar(i);
|
||||
}
|
||||
}
|
||||
evaluator.cleanup();
|
||||
}
|
||||
};
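// Worked example of the loop partition above (illustrative comment, not in the
// original source): with size = 103 and PacketSize = 4, UnrolledSize = 96 (six
// groups of 4 packets), VectorizedSize = 100 (one more packet at i = 96), and
// the trailing scalar loop covers indices 100..102.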
|
||||
|
||||
|
||||
|
||||
// Multicore strategy: the index space is partitioned and each partition is executed on a single core
|
||||
#ifdef EIGEN_USE_THREADS
|
||||
template <typename Evaluator, typename Index, bool Vectorizable>
|
||||
struct EvalRange {
|
||||
static void run(Evaluator* evaluator_in, const Index first, const Index last) {
|
||||
Evaluator evaluator = *evaluator_in;
|
||||
eigen_assert(last >= first);
|
||||
for (Index i = first; i < last; ++i) {
|
||||
evaluator.evalScalar(i);
|
||||
}
|
||||
}
|
||||
|
||||
static Index alignBlockSize(Index size) {
|
||||
return size;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Evaluator, typename Index>
|
||||
struct EvalRange<Evaluator, Index, true> {
|
||||
static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
|
||||
|
||||
static void run(Evaluator* evaluator_in, const Index first, const Index last) {
|
||||
Evaluator evaluator = *evaluator_in;
|
||||
eigen_assert(last >= first);
|
||||
Index i = first;
|
||||
if (last - first >= PacketSize) {
|
||||
eigen_assert(first % PacketSize == 0);
|
||||
Index last_chunk_offset = last - 4 * PacketSize;
|
||||
// Give the compiler a strong hint to unroll the loop. But don't insist
|
||||
// on unrolling, because if the function is expensive the compiler should not
|
||||
// unroll the loop at the expense of inlining.
|
||||
for (; i <= last_chunk_offset; i += 4*PacketSize) {
|
||||
for (Index j = 0; j < 4; j++) {
|
||||
evaluator.evalPacket(i + j * PacketSize);
|
||||
}
|
||||
}
|
||||
last_chunk_offset = last - PacketSize;
|
||||
for (; i <= last_chunk_offset; i += PacketSize) {
|
||||
evaluator.evalPacket(i);
|
||||
}
|
||||
}
|
||||
for (; i < last; ++i) {
|
||||
evaluator.evalScalar(i);
|
||||
}
|
||||
}
|
||||
|
||||
static Index alignBlockSize(Index size) {
|
||||
// Align block size to packet size and account for unrolling in run above.
|
||||
if (size >= 16 * PacketSize) {
|
||||
return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
|
||||
}
|
||||
// Aligning to 4 * PacketSize would increase block size by more than 25%.
|
||||
return (size + PacketSize - 1) & ~(PacketSize - 1);
|
||||
}
|
||||
};
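// Illustrative comment (not in the original source): with PacketSize = 8,
// alignBlockSize(1000) rounds up to a multiple of 4 * PacketSize = 32 and
// returns 1024, whereas alignBlockSize(100), being below 16 * PacketSize,
// only rounds up to a multiple of PacketSize and returns 104.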
|
||||
|
||||
template <typename Expression, bool Vectorizable>
|
||||
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
|
||||
public:
|
||||
typedef typename Expression::Index Index;
|
||||
static inline void run(const Expression& expr, const ThreadPoolDevice& device)
|
||||
{
|
||||
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
|
||||
Evaluator evaluator(expr, device);
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||
if (needs_assign)
|
||||
{
|
||||
const Index size = array_prod(evaluator.dimensions());
|
||||
#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
|
||||
device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
|
||||
EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
|
||||
[&evaluator](Index first, Index last) {
|
||||
EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
|
||||
});
|
||||
#else
|
||||
size_t num_threads = device.numThreads();
|
||||
if (num_threads > 1) {
|
||||
num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
|
||||
size, evaluator.costPerCoeff(Vectorizable), num_threads);
|
||||
}
|
||||
if (num_threads == 1) {
|
||||
EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
|
||||
} else {
|
||||
const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
|
||||
Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
|
||||
const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
|
||||
const Index numblocks = size / blocksize;
|
||||
|
||||
Barrier barrier(numblocks);
|
||||
for (int i = 0; i < numblocks; ++i) {
|
||||
device.enqueue_with_barrier(
|
||||
&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
|
||||
&evaluator, i * blocksize, (i + 1) * blocksize);
|
||||
}
|
||||
if (numblocks * blocksize < size) {
|
||||
EvalRange<Evaluator, Index, Vectorizable>::run(
|
||||
&evaluator, numblocks * blocksize, size);
|
||||
}
|
||||
barrier.Wait();
|
||||
}
|
||||
#endif // !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
|
||||
}
|
||||
evaluator.cleanup();
|
||||
}
|
||||
};
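// Illustrative comment (not in the original source), tracing the simple thread
// pool branch above: with size = 1000, num_threads = 4 and PacketSize = 8,
// blocksz = 250 + 7 = 257, blocksize = max(8, 257 - 257 % 8) = 256 and
// numblocks = 3, so three blocks of 256 coefficients are enqueued and the
// remaining indices 768..999 are handled by the final EvalRange::run call.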
|
||||
#endif // EIGEN_USE_THREADS
|
||||
|
||||
|
||||
// GPU: the evaluation of the expression is offloaded to a GPU.
|
||||
#if defined(EIGEN_USE_GPU)
|
||||
|
||||
template <typename Expression, bool Vectorizable>
|
||||
class TensorExecutor<Expression, GpuDevice, Vectorizable> {
|
||||
public:
|
||||
typedef typename Expression::Index Index;
|
||||
static void run(const Expression& expr, const GpuDevice& device);
|
||||
};
|
||||
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
template <typename Evaluator, typename Index, bool Vectorizable>
|
||||
struct EigenMetaKernelEval {
|
||||
static __device__ EIGEN_ALWAYS_INLINE
|
||||
void run(Evaluator& eval, Index first, Index last, Index step_size) {
|
||||
for (Index i = first; i < last; i += step_size) {
|
||||
eval.evalScalar(i);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Evaluator, typename Index>
|
||||
struct EigenMetaKernelEval<Evaluator, Index, true> {
|
||||
static __device__ EIGEN_ALWAYS_INLINE
|
||||
void run(Evaluator& eval, Index first, Index last, Index step_size) {
|
||||
const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
|
||||
const Index vectorized_size = (last / PacketSize) * PacketSize;
|
||||
const Index vectorized_step_size = step_size * PacketSize;
|
||||
|
||||
// Use the vector path
|
||||
for (Index i = first * PacketSize; i < vectorized_size;
|
||||
i += vectorized_step_size) {
|
||||
eval.evalPacket(i);
|
||||
}
|
||||
for (Index i = vectorized_size + first; i < last; i += step_size) {
|
||||
eval.evalScalar(i);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Evaluator, typename Index>
|
||||
__global__ void
|
||||
__launch_bounds__(1024)
|
||||
EigenMetaKernel(Evaluator eval, Index size) {
|
||||
|
||||
const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const Index step_size = blockDim.x * gridDim.x;
|
||||
|
||||
const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
|
||||
EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
|
||||
}
|
||||
|
||||
/*static*/
|
||||
template <typename Expression, bool Vectorizable>
|
||||
inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
|
||||
const Expression& expr, const GpuDevice& device) {
|
||||
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||
if (needs_assign) {
|
||||
const int block_size = device.maxCudaThreadsPerBlock();
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() *
|
||||
device.maxCudaThreadsPerMultiProcessor() / block_size;
|
||||
const Index size = array_prod(evaluator.dimensions());
|
||||
// Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
|
||||
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
|
||||
|
||||
LAUNCH_CUDA_KERNEL(
|
||||
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
|
||||
num_blocks, block_size, 0, device, evaluator, size);
|
||||
}
|
||||
evaluator.cleanup();
|
||||
}
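// Illustrative comment (not in the original source): on a GPU reporting 1024
// threads per block, 20 multiprocessors and 2048 threads per multiprocessor,
// block_size = 1024 and max_blocks = 20 * 2048 / 1024 = 40; a tensor with
// 10000 coefficients then launches
// num_blocks = max(min(40, divup(10000, 1024)), 1) = 10 blocks.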
|
||||
|
||||
#endif // __CUDACC__
|
||||
#endif // EIGEN_USE_GPU
|
||||
|
||||
// SYCL Executor policy
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
|
||||
template <typename Expression, bool Vectorizable>
|
||||
class TensorExecutor<Expression, SyclDevice, Vectorizable> {
|
||||
public:
|
||||
static inline void run(const Expression &expr, const SyclDevice &device) {
|
||||
// call TensorSYCL module
|
||||
TensorSycl::run(expr, device);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
@ -0,0 +1,371 @@
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorExpr
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor expression classes.
|
||||
*
|
||||
* The TensorCwiseNullaryOp class applies a nullary operator to an expression.
|
||||
* This is typically used to generate constants.
|
||||
*
|
||||
* The TensorCwiseUnaryOp class represents an expression where a unary operator
|
||||
* (e.g. cwiseSqrt) is applied to an expression.
|
||||
*
|
||||
* The TensorCwiseBinaryOp class represents an expression where a binary
|
||||
* operator (e.g. addition) is applied to a lhs and a rhs expression.
|
||||
*
|
||||
*/
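// Illustrative comment (not part of the original file), assuming a, b and c are
// Eigen::Tensor objects of matching shape: an assignment like
//   c = a + b;
// builds a TensorCwiseBinaryOp expression node, while
//   c = a.sqrt();
// builds a TensorCwiseUnaryOp; no computation happens until the expression is
// assigned to (or evaluated into) a destination tensor.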
|
||||
namespace internal {
|
||||
template<typename NullaryOp, typename XprType>
|
||||
struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> >
|
||||
: traits<XprType>
|
||||
{
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::Nested XprTypeNested;
|
||||
typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
|
||||
enum {
|
||||
Flags = 0
|
||||
};
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename NullaryOp, typename XprType>
|
||||
class TensorCwiseNullaryOp : public TensorBase<TensorCwiseNullaryOp<NullaryOp, XprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef TensorCwiseNullaryOp<NullaryOp, XprType> Nested;
|
||||
typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp())
|
||||
: m_xpr(xpr), m_functor(func) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
nestedExpression() const { return m_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const NullaryOp& functor() const { return m_functor; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const NullaryOp m_functor;
|
||||
};
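// Illustrative comment (not in the original source): a nullary expression is what
//   t = t.constant(1.5f);    // fill with a constant
//   t = t.random();          // fill with random values
// build under the hood; the functor (scalar_constant_op, a random generator, ...)
// is stored in m_functor and invoked per coefficient by the evaluator.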
|
||||
|
||||
|
||||
|
||||
namespace internal {
|
||||
template<typename UnaryOp, typename XprType>
|
||||
struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> >
|
||||
: traits<XprType>
|
||||
{
|
||||
// TODO(phli): Add InputScalar, InputPacket. Check references to
|
||||
// current Scalar/Packet to see if the intent is Input or Output.
|
||||
typedef typename result_of<UnaryOp(typename XprType::Scalar)>::type Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprType::Nested XprTypeNested;
|
||||
typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename UnaryOp, typename XprType>
|
||||
struct eval<TensorCwiseUnaryOp<UnaryOp, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorCwiseUnaryOp<UnaryOp, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename UnaryOp, typename XprType>
|
||||
struct nested<TensorCwiseUnaryOp<UnaryOp, XprType>, 1, typename eval<TensorCwiseUnaryOp<UnaryOp, XprType> >::type>
|
||||
{
|
||||
typedef TensorCwiseUnaryOp<UnaryOp, XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename UnaryOp, typename XprType>
|
||||
class TensorCwiseUnaryOp : public TensorBase<TensorCwiseUnaryOp<UnaryOp, XprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
// TODO(phli): Add InputScalar, InputPacket. Check references to
|
||||
// current Scalar/Packet to see if the intent is Input or Output.
|
||||
typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef Scalar CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
|
||||
: m_xpr(xpr), m_functor(func) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const UnaryOp& functor() const { return m_functor; }
|
||||
|
||||
/** \returns the nested expression */
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
nestedExpression() const { return m_xpr; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const UnaryOp m_functor;
|
||||
};
|
||||
|
||||
|
||||
namespace internal {
|
||||
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
|
||||
struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
|
||||
{
|
||||
// Type promotion to handle the case where the types of the lhs and the rhs
|
||||
// are different.
|
||||
// TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to
|
||||
// current Scalar/Packet to see if the intent is Inputs or Output.
|
||||
typedef typename result_of<
|
||||
BinaryOp(typename LhsXprType::Scalar,
|
||||
typename RhsXprType::Scalar)>::type Scalar;
|
||||
typedef traits<LhsXprType> XprTraits;
|
||||
typedef typename promote_storage_type<
|
||||
typename traits<LhsXprType>::StorageKind,
|
||||
typename traits<RhsXprType>::StorageKind>::ret StorageKind;
|
||||
typedef typename promote_index_type<
|
||||
typename traits<LhsXprType>::Index,
|
||||
typename traits<RhsXprType>::Index>::type Index;
|
||||
typedef typename LhsXprType::Nested LhsNested;
|
||||
typedef typename RhsXprType::Nested RhsNested;
|
||||
typedef typename remove_reference<LhsNested>::type _LhsNested;
|
||||
typedef typename remove_reference<RhsNested>::type _RhsNested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
|
||||
enum {
|
||||
Flags = 0
|
||||
};
|
||||
};
|
||||
|
||||
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
|
||||
struct eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>& type;
|
||||
};
|
||||
|
||||
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
|
||||
struct nested<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, 1, typename eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >::type>
|
||||
{
|
||||
typedef TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
|
||||
class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
// TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to
|
||||
// current Scalar/Packet to see if the intent is Inputs or Output.
|
||||
typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef Scalar CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorCwiseBinaryOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp())
|
||||
: m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const BinaryOp& functor() const { return m_functor; }
|
||||
|
||||
/** \returns the nested expressions */
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename LhsXprType::Nested>::type&
|
||||
lhsExpression() const { return m_lhs_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename RhsXprType::Nested>::type&
|
||||
rhsExpression() const { return m_rhs_xpr; }
|
||||
|
||||
protected:
|
||||
typename LhsXprType::Nested m_lhs_xpr;
|
||||
typename RhsXprType::Nested m_rhs_xpr;
|
||||
const BinaryOp m_functor;
|
||||
};
|
||||
|
||||
|
||||
namespace internal {
|
||||
template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
|
||||
struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >
|
||||
{
|
||||
// Type promotion to handle the case where the types of the args are different.
|
||||
typedef typename result_of<
|
||||
TernaryOp(typename Arg1XprType::Scalar,
|
||||
typename Arg2XprType::Scalar,
|
||||
typename Arg3XprType::Scalar)>::type Scalar;
|
||||
typedef traits<Arg1XprType> XprTraits;
|
||||
typedef typename traits<Arg1XprType>::StorageKind StorageKind;
|
||||
typedef typename traits<Arg1XprType>::Index Index;
|
||||
typedef typename Arg1XprType::Nested Arg1Nested;
|
||||
typedef typename Arg2XprType::Nested Arg2Nested;
|
||||
typedef typename Arg3XprType::Nested Arg3Nested;
|
||||
typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
|
||||
typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
|
||||
typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
|
||||
enum {
|
||||
Flags = 0
|
||||
};
|
||||
};
|
||||
|
||||
template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
|
||||
struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type;
|
||||
};
|
||||
|
||||
template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
|
||||
struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1, typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type>
|
||||
{
|
||||
typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
|
||||
class TensorCwiseTernaryOp : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef Scalar CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp())
|
||||
: m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const TernaryOp& functor() const { return m_functor; }
|
||||
|
||||
/** \returns the nested expressions */
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename Arg1XprType::Nested>::type&
|
||||
arg1Expression() const { return m_arg1_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename Arg2XprType::Nested>::type&
|
||||
arg2Expression() const { return m_arg2_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename Arg3XprType::Nested>::type&
|
||||
arg3Expression() const { return m_arg3_xpr; }
|
||||
|
||||
protected:
|
||||
typename Arg1XprType::Nested m_arg1_xpr;
|
||||
typename Arg2XprType::Nested m_arg2_xpr;
|
||||
typename Arg3XprType::Nested m_arg3_xpr;
|
||||
const TernaryOp m_functor;
|
||||
};
|
||||
|
||||
|
||||
namespace internal {
|
||||
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
|
||||
struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
|
||||
: traits<ThenXprType>
|
||||
{
|
||||
typedef typename traits<ThenXprType>::Scalar Scalar;
|
||||
typedef traits<ThenXprType> XprTraits;
|
||||
typedef typename promote_storage_type<typename traits<ThenXprType>::StorageKind,
|
||||
typename traits<ElseXprType>::StorageKind>::ret StorageKind;
|
||||
typedef typename promote_index_type<typename traits<ElseXprType>::Index,
|
||||
typename traits<ThenXprType>::Index>::type Index;
|
||||
typedef typename IfXprType::Nested IfNested;
|
||||
typedef typename ThenXprType::Nested ThenNested;
|
||||
typedef typename ElseXprType::Nested ElseNested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
|
||||
struct eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorSelectOp<IfXprType, ThenXprType, ElseXprType>& type;
|
||||
};
|
||||
|
||||
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
|
||||
struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >::type>
|
||||
{
|
||||
typedef TensorSelectOp<IfXprType, ThenXprType, ElseXprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
|
||||
class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename internal::promote_storage_type<typename ThenXprType::CoeffReturnType,
|
||||
typename ElseXprType::CoeffReturnType>::ret CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorSelectOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorSelectOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorSelectOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
TensorSelectOp(const IfXprType& a_condition,
|
||||
const ThenXprType& a_then,
|
||||
const ElseXprType& a_else)
|
||||
: m_condition(a_condition), m_then(a_then), m_else(a_else)
|
||||
{ }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const IfXprType& ifExpression() const { return m_condition; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const ThenXprType& thenExpression() const { return m_then; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const ElseXprType& elseExpression() const { return m_else; }
|
||||
|
||||
protected:
|
||||
typename IfXprType::Nested m_condition;
|
||||
typename ThenXprType::Nested m_then;
|
||||
typename ElseXprType::Nested m_else;
|
||||
};
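// Illustrative comment (not in the original source), assuming a, b and t are
// tensors of matching shape: the expression
//   t = (a > a.constant(0.0f)).select(a, b);
// builds a TensorSelectOp whose evaluator picks the 'then' or 'else' coefficient
// depending on the boolean condition tensor.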
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
@ -0,0 +1,651 @@
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Jianwei Cui <thucjw@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H
|
||||
|
||||
// This code requires the ability to initialize arrays of constant
|
||||
// values directly inside a class.
|
||||
#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorFFT
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor FFT class.
|
||||
*
|
||||
* TODO:
|
||||
* Vectorize the Cooley Tukey and the Bluestein algorithm
|
||||
* Add support for multithreaded evaluation
|
||||
* Improve the performance on GPU
|
||||
*/
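// Illustrative usage sketch (comment only, assuming the fft() method exposed by
// TensorBase):
//   Eigen::Tensor<std::complex<float>, 1> x(8);
//   x.setRandom();
//   Eigen::array<int, 1> dims = {{0}};
//   Eigen::Tensor<std::complex<float>, 1> X =
//       x.fft<Eigen::BothParts, Eigen::FFT_FORWARD>(dims);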
|
||||
|
||||
template <bool NeedUprade> struct MakeComplex {
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
T operator() (const T& val) const { return val; }
|
||||
};
|
||||
|
||||
template <> struct MakeComplex<true> {
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
std::complex<T> operator() (const T& val) const { return std::complex<T>(val, 0); }
|
||||
};
|
||||
|
||||
template <> struct MakeComplex<false> {
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
std::complex<T> operator() (const std::complex<T>& val) const { return val; }
|
||||
};
|
||||
|
||||
template <int ResultType> struct PartOf {
|
||||
template <typename T> T operator() (const T& val) const { return val; }
|
||||
};
|
||||
|
||||
template <> struct PartOf<RealPart> {
|
||||
template <typename T> T operator() (const std::complex<T>& val) const { return val.real(); }
|
||||
};
|
||||
|
||||
template <> struct PartOf<ImagPart> {
|
||||
template <typename T> T operator() (const std::complex<T>& val) const { return val.imag(); }
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
template <typename FFT, typename XprType, int FFTResultType, int FFTDir>
|
||||
struct traits<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir> > : public traits<XprType> {
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename NumTraits<typename XprTraits::Scalar>::Real RealScalar;
|
||||
typedef typename std::complex<RealScalar> ComplexScalar;
|
||||
typedef typename XprTraits::Scalar InputScalar;
|
||||
typedef typename conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
|
||||
struct eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, Eigen::Dense> {
|
||||
typedef const TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>& type;
|
||||
};
|
||||
|
||||
template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
|
||||
struct nested<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, 1, typename eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> >::type> {
|
||||
typedef TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
template <typename FFT, typename XprType, int FFTResultType, int FFTDir>
|
||||
class TensorFFTOp : public TensorBase<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir>, ReadOnlyAccessors> {
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorFFTOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename std::complex<RealScalar> ComplexScalar;
|
||||
typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
|
||||
typedef OutputScalar CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorFFTOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorFFTOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorFFTOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft)
|
||||
: m_xpr(expr), m_fft(fft) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const FFT& fft() const { return m_fft; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type& expression() const {
|
||||
return m_xpr;
|
||||
}
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const FFT m_fft;
|
||||
};
|
||||
|
||||
// Eval as rvalue
|
||||
template <typename FFT, typename ArgType, typename Device, int FFTResultType, int FFTDir>
|
||||
struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device> {
|
||||
typedef TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename std::complex<RealScalar> ComplexScalar;
|
||||
typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
|
||||
typedef internal::traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::Scalar InputScalar;
|
||||
typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
|
||||
typedef OutputScalar CoeffReturnType;
|
||||
typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = true,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
eigen_assert(input_dims[i] > 0);
|
||||
m_dimensions[i] = input_dims[i];
|
||||
}
|
||||
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_strides[0] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
|
||||
}
|
||||
} else {
|
||||
m_strides[NumDims - 1] = 1;
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
|
||||
}
|
||||
}
|
||||
m_size = m_dimensions.TotalSize();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
|
||||
return m_dimensions;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
if (data) {
|
||||
evalToBuf(data);
|
||||
return false;
|
||||
} else {
|
||||
m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size);
|
||||
evalToBuf(m_data);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
if (m_data) {
|
||||
m_device.deallocate(m_data);
|
||||
m_data = NULL;
|
||||
}
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const {
|
||||
return m_data[index];
|
||||
}
|
||||
|
||||
template <int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType
|
||||
packet(Index index) const {
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
|
||||
|
||||
|
||||
private:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) {
|
||||
const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value;
|
||||
ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size);
|
||||
|
||||
for (Index i = 0; i < m_size; ++i) {
|
||||
buf[i] = MakeComplex<internal::is_same<InputScalar, RealScalar>::value>()(m_impl.coeff(i));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < m_fft.size(); ++i) {
|
||||
Index dim = m_fft[i];
|
||||
eigen_assert(dim >= 0 && dim < NumDims);
|
||||
Index line_len = m_dimensions[dim];
|
||||
eigen_assert(line_len >= 1);
|
||||
ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len);
|
||||
const bool is_power_of_two = isPowerOfTwo(line_len);
|
||||
const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len);
|
||||
const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite);
|
||||
|
||||
ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite);
|
||||
ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite);
|
||||
ComplexScalar* pos_j_base_powered = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1));
|
||||
if (!is_power_of_two) {
|
||||
// Compute twiddle factors
|
||||
// t_n = exp(sqrt(-1) * pi * n^2 / line_len)
|
||||
// for n = 0, 1,..., line_len-1.
|
||||
// For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2
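// (added note: the recurrence is exact since 2*(n-1)^2 - (n-2)^2 + 2*1^2 = n^2,
//  so each t_n reproduces exp(sqrt(-1) * pi * n^2 / line_len) up to rounding)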
|
||||
pos_j_base_powered[0] = ComplexScalar(1, 0);
|
||||
if (line_len > 1) {
|
||||
const RealScalar pi_over_len(EIGEN_PI / line_len);
|
||||
const ComplexScalar pos_j_base = ComplexScalar(
|
||||
std::cos(pi_over_len), std::sin(pi_over_len));
|
||||
pos_j_base_powered[1] = pos_j_base;
|
||||
if (line_len > 2) {
|
||||
const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
|
||||
for (int j = 2; j < line_len + 1; ++j) {
|
||||
pos_j_base_powered[j] = pos_j_base_powered[j - 1] *
|
||||
pos_j_base_powered[j - 1] /
|
||||
pos_j_base_powered[j - 2] * pos_j_base_sq;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) {
|
||||
const Index base_offset = getBaseOffsetFromIndex(partial_index, dim);
|
||||
|
||||
// get data into line_buf
|
||||
const Index stride = m_strides[dim];
|
||||
if (stride == 1) {
|
||||
memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
|
||||
} else {
|
||||
Index offset = base_offset;
|
||||
for (int j = 0; j < line_len; ++j, offset += stride) {
|
||||
line_buf[j] = buf[offset];
|
||||
}
|
||||
}
|
||||
|
||||
// process the line
|
||||
if (is_power_of_two) {
|
||||
processDataLineCooleyTukey(line_buf, line_len, log_len);
|
||||
}
|
||||
else {
|
||||
processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered);
|
||||
}
|
||||
|
||||
// write back
|
||||
if (FFTDir == FFT_FORWARD && stride == 1) {
|
||||
memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
|
||||
} else {
|
||||
Index offset = base_offset;
|
||||
const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0);
|
||||
for (int j = 0; j < line_len; ++j, offset += stride) {
|
||||
buf[offset] = (FFTDir == FFT_FORWARD) ? line_buf[j] : line_buf[j] * div_factor;
|
||||
}
|
||||
}
|
||||
}
|
||||
m_device.deallocate(line_buf);
|
||||
if (!is_power_of_two) {
|
||||
m_device.deallocate(a);
|
||||
m_device.deallocate(b);
|
||||
m_device.deallocate(pos_j_base_powered);
|
||||
}
|
||||
}
|
||||
|
||||
if(!write_to_out) {
|
||||
for (Index i = 0; i < m_size; ++i) {
|
||||
data[i] = PartOf<FFTResultType>()(buf[i]);
|
||||
}
|
||||
m_device.deallocate(buf);
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) {
|
||||
eigen_assert(x > 0);
|
||||
return !(x & (x - 1));
|
||||
}
|
||||
|
||||
// The composite number for padding, used in Bluestein's FFT algorithm
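// In this implementation it is the smallest power of two (at least 2) that is >= 2 * n - 1.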
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) {
|
||||
Index i = 2;
|
||||
while (i < 2 * n - 1) i *= 2;
|
||||
return i;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) {
|
||||
Index log2m = 0;
|
||||
while (m >>= 1) log2m++;
|
||||
return log2m;
|
||||
}
|
||||
|
||||
// Call the Cooley-Tukey algorithm directly; the data length must be a power of 2
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, Index log_len) {
|
||||
eigen_assert(isPowerOfTwo(line_len));
|
||||
scramble_FFT(line_buf, line_len);
|
||||
compute_1D_Butterfly<FFTDir>(line_buf, line_len, log_len);
|
||||
}
|
||||
|
||||
// Call Bluestein's FFT algorithm; m is a good composite number greater than (2 * n - 1), used as the padding length
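// Bluestein rewrites the length-n DFT as a circular convolution of length m, which is
// evaluated with two forward FFTs of size m, a pointwise product, and one inverse FFT,
// so only power-of-two transforms are needed for arbitrary line lengths.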
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, Index good_composite, Index log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) {
|
||||
Index n = line_len;
|
||||
Index m = good_composite;
|
||||
ComplexScalar* data = line_buf;
|
||||
|
||||
for (Index i = 0; i < n; ++i) {
|
||||
if(FFTDir == FFT_FORWARD) {
|
||||
a[i] = data[i] * numext::conj(pos_j_base_powered[i]);
|
||||
}
|
||||
else {
|
||||
a[i] = data[i] * pos_j_base_powered[i];
|
||||
}
|
||||
}
|
||||
for (Index i = n; i < m; ++i) {
|
||||
a[i] = ComplexScalar(0, 0);
|
||||
}
|
||||
|
||||
for (Index i = 0; i < n; ++i) {
|
||||
if(FFTDir == FFT_FORWARD) {
|
||||
b[i] = pos_j_base_powered[i];
|
||||
}
|
||||
else {
|
||||
b[i] = numext::conj(pos_j_base_powered[i]);
|
||||
}
|
||||
}
|
||||
for (Index i = n; i < m - n; ++i) {
|
||||
b[i] = ComplexScalar(0, 0);
|
||||
}
|
||||
for (Index i = m - n; i < m; ++i) {
|
||||
if(FFTDir == FFT_FORWARD) {
|
||||
b[i] = pos_j_base_powered[m-i];
|
||||
}
|
||||
else {
|
||||
b[i] = numext::conj(pos_j_base_powered[m-i]);
|
||||
}
|
||||
}
|
||||
|
||||
scramble_FFT(a, m);
|
||||
compute_1D_Butterfly<FFT_FORWARD>(a, m, log_len);
|
||||
|
||||
scramble_FFT(b, m);
|
||||
compute_1D_Butterfly<FFT_FORWARD>(b, m, log_len);
|
||||
|
||||
for (Index i = 0; i < m; ++i) {
|
||||
a[i] *= b[i];
|
||||
}
|
||||
|
||||
scramble_FFT(a, m);
|
||||
compute_1D_Butterfly<FFT_REVERSE>(a, m, log_len);
|
||||
|
||||
// Do the scaling after the inverse FFT
|
||||
for (Index i = 0; i < m; ++i) {
|
||||
a[i] /= m;
|
||||
}
|
||||
|
||||
for (Index i = 0; i < n; ++i) {
|
||||
if(FFTDir == FFT_FORWARD) {
|
||||
data[i] = a[i] * numext::conj(pos_j_base_powered[i]);
|
||||
}
|
||||
else {
|
||||
data[i] = a[i] * pos_j_base_powered[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
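// Reorder the data into bit-reversed index order, the permutation required before the
// in-place decimation-in-time butterfly passes in compute_1D_Butterfly.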
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) {
|
||||
eigen_assert(isPowerOfTwo(n));
|
||||
Index j = 1;
|
||||
for (Index i = 1; i < n; ++i){
|
||||
if (j > i) {
|
||||
std::swap(data[j-1], data[i-1]);
|
||||
}
|
||||
Index m = n >> 1;
|
||||
while (m >= 2 && j > m) {
|
||||
j -= m;
|
||||
m >>= 1;
|
||||
}
|
||||
j += m;
|
||||
}
|
||||
}
|
||||
|
||||
template <int Dir>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) {
|
||||
ComplexScalar tmp = data[1];
|
||||
data[1] = data[0] - data[1];
|
||||
data[0] += tmp;
|
||||
}
|
||||
|
||||
template <int Dir>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) {
|
||||
ComplexScalar tmp[4];
|
||||
tmp[0] = data[0] + data[1];
|
||||
tmp[1] = data[0] - data[1];
|
||||
tmp[2] = data[2] + data[3];
|
||||
if (Dir == FFT_FORWARD) {
|
||||
tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]);
|
||||
} else {
|
||||
tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]);
|
||||
}
|
||||
data[0] = tmp[0] + tmp[2];
|
||||
data[1] = tmp[1] + tmp[3];
|
||||
data[2] = tmp[0] - tmp[2];
|
||||
data[3] = tmp[1] - tmp[3];
|
||||
}
|
||||
|
||||
template <int Dir>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) {
|
||||
ComplexScalar tmp_1[8];
|
||||
ComplexScalar tmp_2[8];
|
||||
|
||||
tmp_1[0] = data[0] + data[1];
|
||||
tmp_1[1] = data[0] - data[1];
|
||||
tmp_1[2] = data[2] + data[3];
|
||||
if (Dir == FFT_FORWARD) {
|
||||
tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1);
|
||||
} else {
|
||||
tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1);
|
||||
}
|
||||
tmp_1[4] = data[4] + data[5];
|
||||
tmp_1[5] = data[4] - data[5];
|
||||
tmp_1[6] = data[6] + data[7];
|
||||
if (Dir == FFT_FORWARD) {
|
||||
tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1);
|
||||
} else {
|
||||
tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1);
|
||||
}
|
||||
tmp_2[0] = tmp_1[0] + tmp_1[2];
|
||||
tmp_2[1] = tmp_1[1] + tmp_1[3];
|
||||
tmp_2[2] = tmp_1[0] - tmp_1[2];
|
||||
tmp_2[3] = tmp_1[1] - tmp_1[3];
|
||||
tmp_2[4] = tmp_1[4] + tmp_1[6];
|
||||
// SQRT2DIV2 = sqrt(2)/2
|
||||
#define SQRT2DIV2 0.7071067811865476
|
||||
if (Dir == FFT_FORWARD) {
|
||||
tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2);
|
||||
tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1);
|
||||
tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2);
|
||||
} else {
|
||||
tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2);
|
||||
tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1);
|
||||
tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2);
|
||||
}
|
||||
data[0] = tmp_2[0] + tmp_2[4];
|
||||
data[1] = tmp_2[1] + tmp_2[5];
|
||||
data[2] = tmp_2[2] + tmp_2[6];
|
||||
data[3] = tmp_2[3] + tmp_2[7];
|
||||
data[4] = tmp_2[0] - tmp_2[4];
|
||||
data[5] = tmp_2[1] - tmp_2[5];
|
||||
data[6] = tmp_2[2] - tmp_2[6];
|
||||
data[7] = tmp_2[3] - tmp_2[7];
|
||||
}
|
||||
|
||||
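// Merge two half-length transforms into a full length-n transform. The loop is unrolled
// by four, and the running twiddle factor w is advanced with powers of
// (1 + wp) = exp(+/- 2*pi*i/n), computed once before the loop from the sine lookup
// tables, so no sin/cos is evaluated per element.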
template <int Dir>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge(
|
||||
ComplexScalar* data, Index n, Index n_power_of_2) {
|
||||
// Original code:
|
||||
// RealScalar wtemp = std::sin(M_PI/n);
|
||||
// RealScalar wpi = -std::sin(2 * M_PI/n);
|
||||
const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2];
|
||||
const RealScalar wpi = (Dir == FFT_FORWARD)
|
||||
? m_minus_sin_2_PI_div_n_LUT[n_power_of_2]
|
||||
: -m_minus_sin_2_PI_div_n_LUT[n_power_of_2];
|
||||
|
||||
const ComplexScalar wp(wtemp, wpi);
|
||||
const ComplexScalar wp_one = wp + ComplexScalar(1, 0);
|
||||
const ComplexScalar wp_one_2 = wp_one * wp_one;
|
||||
const ComplexScalar wp_one_3 = wp_one_2 * wp_one;
|
||||
const ComplexScalar wp_one_4 = wp_one_3 * wp_one;
|
||||
const Index n2 = n / 2;
|
||||
ComplexScalar w(1.0, 0.0);
|
||||
for (Index i = 0; i < n2; i += 4) {
|
||||
ComplexScalar temp0(data[i + n2] * w);
|
||||
ComplexScalar temp1(data[i + 1 + n2] * w * wp_one);
|
||||
ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2);
|
||||
ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3);
|
||||
w = w * wp_one_4;
|
||||
|
||||
data[i + n2] = data[i] - temp0;
|
||||
data[i] += temp0;
|
||||
|
||||
data[i + 1 + n2] = data[i + 1] - temp1;
|
||||
data[i + 1] += temp1;
|
||||
|
||||
data[i + 2 + n2] = data[i + 2] - temp2;
|
||||
data[i + 2] += temp2;
|
||||
|
||||
data[i + 3 + n2] = data[i + 3] - temp3;
|
||||
data[i + 3] += temp3;
|
||||
}
|
||||
}
|
||||
|
||||
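// Recursive radix-2 driver: the (already bit-reversed) data is split in halves down to
// hand-unrolled base cases of length 2, 4 and 8, then the halves are merged on the way up.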
template <int Dir>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(
|
||||
ComplexScalar* data, Index n, Index n_power_of_2) {
|
||||
eigen_assert(isPowerOfTwo(n));
|
||||
if (n > 8) {
|
||||
compute_1D_Butterfly<Dir>(data, n / 2, n_power_of_2 - 1);
|
||||
compute_1D_Butterfly<Dir>(data + n / 2, n / 2, n_power_of_2 - 1);
|
||||
butterfly_1D_merge<Dir>(data, n, n_power_of_2);
|
||||
} else if (n == 8) {
|
||||
butterfly_8<Dir>(data);
|
||||
} else if (n == 4) {
|
||||
butterfly_4<Dir>(data);
|
||||
} else if (n == 2) {
|
||||
butterfly_2<Dir>(data);
|
||||
}
|
||||
}
|
||||
|
||||
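// Maps the index of a 1-D line (counted with dimension omitted_dim removed) to the flat
// offset of that line's first element in the buffer.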
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const {
|
||||
Index result = 0;
|
||||
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > omitted_dim; --i) {
|
||||
const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim];
|
||||
const Index idx = index / partial_m_stride;
|
||||
index -= idx * partial_m_stride;
|
||||
result += idx * m_strides[i];
|
||||
}
|
||||
result += index;
|
||||
}
|
||||
else {
|
||||
for (Index i = 0; i < omitted_dim; ++i) {
|
||||
const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim];
|
||||
const Index idx = index / partial_m_stride;
|
||||
index -= idx * partial_m_stride;
|
||||
result += idx * m_strides[i];
|
||||
}
|
||||
result += index;
|
||||
}
|
||||
// The value of index_coords[omitted_dim] is not determined at this step.
|
||||
return result;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const {
|
||||
Index result = base + offset * m_strides[omitted_dim] ;
|
||||
return result;
|
||||
}
|
||||
|
||||
protected:
|
||||
Index m_size;
|
||||
const FFT& m_fft;
|
||||
Dimensions m_dimensions;
|
||||
array<Index, NumDims> m_strides;
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
CoeffReturnType* m_data;
|
||||
const Device& m_device;
|
||||
|
||||
// This will support a maximum FFT size of 2^32 for each dimension
|
||||
// m_sin_PI_div_n_LUT[i] = (-2) * std::pow(std::sin(M_PI / std::pow(2,i)), 2);
|
||||
const RealScalar m_sin_PI_div_n_LUT[32] = {
|
||||
RealScalar(0.0),
|
||||
RealScalar(-2),
|
||||
RealScalar(-0.999999999999999),
|
||||
RealScalar(-0.292893218813453),
|
||||
RealScalar(-0.0761204674887130),
|
||||
RealScalar(-0.0192147195967696),
|
||||
RealScalar(-0.00481527332780311),
|
||||
RealScalar(-0.00120454379482761),
|
||||
RealScalar(-3.01181303795779e-04),
|
||||
RealScalar(-7.52981608554592e-05),
|
||||
RealScalar(-1.88247173988574e-05),
|
||||
RealScalar(-4.70619042382852e-06),
|
||||
RealScalar(-1.17654829809007e-06),
|
||||
RealScalar(-2.94137117780840e-07),
|
||||
RealScalar(-7.35342821488550e-08),
|
||||
RealScalar(-1.83835707061916e-08),
|
||||
RealScalar(-4.59589268710903e-09),
|
||||
RealScalar(-1.14897317243732e-09),
|
||||
RealScalar(-2.87243293150586e-10),
|
||||
RealScalar(-7.18108232902250e-11),
|
||||
RealScalar(-1.79527058227174e-11),
|
||||
RealScalar(-4.48817645568941e-12),
|
||||
RealScalar(-1.12204411392298e-12),
|
||||
RealScalar(-2.80511028480785e-13),
|
||||
RealScalar(-7.01277571201985e-14),
|
||||
RealScalar(-1.75319392800498e-14),
|
||||
RealScalar(-4.38298482001247e-15),
|
||||
RealScalar(-1.09574620500312e-15),
|
||||
RealScalar(-2.73936551250781e-16),
|
||||
RealScalar(-6.84841378126949e-17),
|
||||
RealScalar(-1.71210344531737e-17),
|
||||
RealScalar(-4.28025861329343e-18)
|
||||
};
|
||||
|
||||
// m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i));
|
||||
const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = {
|
||||
RealScalar(0.0),
|
||||
RealScalar(0.0),
|
||||
RealScalar(-1.00000000000000e+00),
|
||||
RealScalar(-7.07106781186547e-01),
|
||||
RealScalar(-3.82683432365090e-01),
|
||||
RealScalar(-1.95090322016128e-01),
|
||||
RealScalar(-9.80171403295606e-02),
|
||||
RealScalar(-4.90676743274180e-02),
|
||||
RealScalar(-2.45412285229123e-02),
|
||||
RealScalar(-1.22715382857199e-02),
|
||||
RealScalar(-6.13588464915448e-03),
|
||||
RealScalar(-3.06795676296598e-03),
|
||||
RealScalar(-1.53398018628477e-03),
|
||||
RealScalar(-7.66990318742704e-04),
|
||||
RealScalar(-3.83495187571396e-04),
|
||||
RealScalar(-1.91747597310703e-04),
|
||||
RealScalar(-9.58737990959773e-05),
|
||||
RealScalar(-4.79368996030669e-05),
|
||||
RealScalar(-2.39684498084182e-05),
|
||||
RealScalar(-1.19842249050697e-05),
|
||||
RealScalar(-5.99211245264243e-06),
|
||||
RealScalar(-2.99605622633466e-06),
|
||||
RealScalar(-1.49802811316901e-06),
|
||||
RealScalar(-7.49014056584716e-07),
|
||||
RealScalar(-3.74507028292384e-07),
|
||||
RealScalar(-1.87253514146195e-07),
|
||||
RealScalar(-9.36267570730981e-08),
|
||||
RealScalar(-4.68133785365491e-08),
|
||||
RealScalar(-2.34066892682746e-08),
|
||||
RealScalar(-1.17033446341373e-08),
|
||||
RealScalar(-5.85167231706864e-09),
|
||||
RealScalar(-2.92583615853432e-09)
|
||||
};
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_HAS_CONSTEXPR
|
||||
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H
|
|
@@ -0,0 +1,389 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorFixedSize
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief The fixed sized version of the tensor class.
|
||||
*
|
||||
* The fixed sized equivalent of
|
||||
* Eigen::Tensor<float, 3> t(3, 5, 7);
|
||||
* is
|
||||
* Eigen::TensorFixedSize<float, Size<3,5,7>> t;
|
||||
*/
|
||||
|
||||
template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
|
||||
class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> >
|
||||
{
|
||||
public:
|
||||
typedef TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> Self;
|
||||
typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> > Base;
|
||||
typedef typename Eigen::internal::nested<Self>::type Nested;
|
||||
typedef typename internal::traits<Self>::StorageKind StorageKind;
|
||||
typedef typename internal::traits<Self>::Index Index;
|
||||
typedef Scalar_ Scalar;
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename Base::CoeffReturnType CoeffReturnType;
|
||||
|
||||
static const int Options = Options_;
|
||||
|
||||
enum {
|
||||
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
|
||||
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
|
||||
CoordAccess = true,
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
typedef Dimensions_ Dimensions;
|
||||
static const std::size_t NumIndices = Dimensions::count;
|
||||
|
||||
protected:
|
||||
TensorStorage<Scalar, Dimensions, Options> m_storage;
|
||||
|
||||
public:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
|
||||
|
||||
// This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
|
||||
// work, because that uses base().coeffRef() - and we don't yet
|
||||
// implement a similar class hierarchy
|
||||
inline Self& base() { return *this; }
|
||||
inline const Self& base() const { return *this; }
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const
|
||||
{
|
||||
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
|
||||
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}});
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
|
||||
{
|
||||
eigen_internal_assert(checkIndexRange(indices));
|
||||
return m_storage.data()[linearizedIndex(indices)];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
|
||||
{
|
||||
eigen_internal_assert(index >= 0 && index < size());
|
||||
return m_storage.data()[index];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& coeff() const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return m_storage.data()[0];
|
||||
}
|
||||
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
|
||||
{
|
||||
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
|
||||
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}});
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
|
||||
{
|
||||
eigen_internal_assert(checkIndexRange(indices));
|
||||
return m_storage.data()[linearizedIndex(indices)];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
|
||||
{
|
||||
eigen_internal_assert(index >= 0 && index < size());
|
||||
return m_storage.data()[index];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& coeffRef()
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return m_storage.data()[0];
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
|
||||
{
|
||||
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
|
||||
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
|
||||
}
|
||||
#else
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
|
||||
{
|
||||
if (Options&RowMajor) {
|
||||
const Index index = i1 + i0 * m_storage.dimensions()[1];
|
||||
return m_storage.data()[index];
|
||||
} else {
|
||||
const Index index = i0 + i1 * m_storage.dimensions()[0];
|
||||
return m_storage.data()[index];
|
||||
}
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
|
||||
{
|
||||
if (Options&RowMajor) {
|
||||
const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0);
|
||||
return m_storage.data()[index];
|
||||
} else {
|
||||
const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2);
|
||||
return m_storage.data()[index];
|
||||
}
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
|
||||
{
|
||||
if (Options&RowMajor) {
|
||||
const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0));
|
||||
return m_storage.data()[index];
|
||||
} else {
|
||||
const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3));
|
||||
return m_storage.data()[index];
|
||||
}
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
|
||||
{
|
||||
if (Options&RowMajor) {
|
||||
const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)));
|
||||
return m_storage.data()[index];
|
||||
} else {
|
||||
const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4)));
|
||||
return m_storage.data()[index];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
|
||||
{
|
||||
eigen_assert(checkIndexRange(indices));
|
||||
return coeff(indices);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
|
||||
{
|
||||
eigen_internal_assert(index >= 0 && index < size());
|
||||
return coeff(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()() const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return coeff();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
|
||||
{
|
||||
// The bracket operator is only for vectors, use the parenthesis operator instead.
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return coeff(index);
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
|
||||
{
|
||||
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
|
||||
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
|
||||
}
|
||||
#else
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
|
||||
{
|
||||
if (Options&RowMajor) {
|
||||
const Index index = i1 + i0 * m_storage.dimensions()[1];
|
||||
return m_storage.data()[index];
|
||||
} else {
|
||||
const Index index = i0 + i1 * m_storage.dimensions()[0];
|
||||
return m_storage.data()[index];
|
||||
}
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
|
||||
{
|
||||
if (Options&RowMajor) {
|
||||
const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0);
|
||||
return m_storage.data()[index];
|
||||
} else {
|
||||
const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2);
|
||||
return m_storage.data()[index];
|
||||
}
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
|
||||
{
|
||||
if (Options&RowMajor) {
|
||||
const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0));
|
||||
return m_storage.data()[index];
|
||||
} else {
|
||||
const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3));
|
||||
return m_storage.data()[index];
|
||||
}
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
|
||||
{
|
||||
if (Options&RowMajor) {
|
||||
const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)));
|
||||
return m_storage.data()[index];
|
||||
} else {
|
||||
const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4)));
|
||||
return m_storage.data()[index];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
|
||||
{
|
||||
eigen_assert(checkIndexRange(indices));
|
||||
return coeffRef(indices);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index index)
|
||||
{
|
||||
eigen_assert(index >= 0 && index < size());
|
||||
return coeffRef(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()()
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return coeffRef();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator[](Index index)
|
||||
{
|
||||
// The bracket operator is only for vectors, use the parenthesis operator instead
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
return coeffRef(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorFixedSize()
|
||||
: m_storage()
|
||||
{
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorFixedSize(const Self& other)
|
||||
: m_storage(other.m_storage)
|
||||
{
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_RVALUE_REFERENCES
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other)
|
||||
: m_storage(other.m_storage)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, ReadOnlyAccessors>& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign;
|
||||
Assign assign(*this, other.derived());
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
}
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, WriteAccessors>& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign;
|
||||
Assign assign(*this, other.derived());
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other)
|
||||
{
|
||||
// FIXME: check that the dimensions of other match the dimensions of *this.
|
||||
// Unfortunately this isn't possible yet when the rhs is an expression.
|
||||
typedef TensorAssignOp<Self, const TensorFixedSize> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other)
|
||||
{
|
||||
// FIXME: check that the dimensions of other match the dimensions of *this.
|
||||
// Unfortunately this isn't possible yet when the rhs is an expression.
|
||||
typedef TensorAssignOp<Self, const OtherDerived> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE bool checkIndexRange(const array<Index, NumIndices>& /*indices*/) const
|
||||
{
|
||||
using internal::array_apply_and_reduce;
|
||||
using internal::array_zip_and_reduce;
|
||||
using internal::greater_equal_zero_op;
|
||||
using internal::logical_and_op;
|
||||
using internal::lesser_op;
|
||||
|
||||
return true;
|
||||
// check whether the indices are all >= 0
|
||||
/* array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
|
||||
// check whether the indices fit in the dimensions
|
||||
array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());*/
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const
|
||||
{
|
||||
if (Options&RowMajor) {
|
||||
return m_storage.dimensions().IndexOfRowMajor(indices);
|
||||
} else {
|
||||
return m_storage.dimensions().IndexOfColMajor(indices);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
|
|
@@ -0,0 +1,169 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
template<typename XprType, template <class> class MakePointer_>
|
||||
struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
|
||||
{
|
||||
// Type promotion to handle the case where the types of the lhs and the rhs are different.
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename traits<XprType>::StorageKind StorageKind;
|
||||
typedef typename traits<XprType>::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
|
||||
enum {
|
||||
Flags = 0
|
||||
};
|
||||
template <class T> struct MakePointer {
|
||||
// Intermediate typedef to work around an MSVC issue.
|
||||
typedef MakePointer_<T> MakePointerT;
|
||||
typedef typename MakePointerT::Type Type;
|
||||
};
|
||||
};
|
||||
|
||||
template<typename XprType, template <class> class MakePointer_>
|
||||
struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorForcedEvalOp<XprType, MakePointer_>& type;
|
||||
};
|
||||
|
||||
template<typename XprType, template <class> class MakePointer_>
|
||||
struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type>
|
||||
{
|
||||
typedef TensorForcedEvalOp<XprType, MakePointer_> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_)
|
||||
|
||||
/** \class TensorForcedEvalOp
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor reshaping class.
|
||||
*
|
||||
*
|
||||
*/
|
||||
/// `template <class> class MakePointer_` is added to convert the host pointer to the device pointer.
|
||||
/// It is added due to the fact that for our device compiler `T*` is not allowed.
|
||||
/// If we want to use the same Evaluator functions we have to convert that type to our pointer `T`.
|
||||
/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_<T>` is `T*` .
|
||||
/// Therefore, by adding the default value, we managed to convert the type and it does not break any
|
||||
/// existing code as its default value is `T*`.
|
||||
template<typename XprType, template <class> class MakePointer_>
|
||||
class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorForcedEvalOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr)
|
||||
: m_xpr(expr) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
};
|
||||
|
||||
|
||||
template<typename ArgType, typename Device, template <class> class MakePointer_>
|
||||
struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
|
||||
{
|
||||
typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType;
|
||||
typedef typename ArgType::Scalar Scalar;
|
||||
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = (PacketSize > 1),
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
|
||||
/// m_op is used for sycl
|
||||
: m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
|
||||
{ }
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
|
||||
const Index numValues = internal::array_prod(m_impl.dimensions());
|
||||
m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
|
||||
// Initialize the memory in case we're dealing with non-POD types.
|
||||
if (NumTraits<CoeffReturnType>::RequireInitialization) {
|
||||
for (Index i = 0; i < numValues; ++i) {
|
||||
new(m_buffer+i) CoeffReturnType();
|
||||
}
|
||||
}
|
||||
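// Materialize the wrapped expression once into m_buffer; coeff() and packet() below then
// reduce to plain loads from that buffer.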
typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
|
||||
EvalTo evalToTmp(m_buffer, m_op);
|
||||
const bool PacketAccess = internal::IsVectorizable<Device, const ArgType>::value;
|
||||
internal::TensorExecutor<const EvalTo, typename internal::remove_const<Device>::type, PacketAccess>::run(evalToTmp, m_device);
|
||||
return true;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_device.deallocate(m_buffer);
|
||||
m_buffer = NULL;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_buffer[index];
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; }
|
||||
|
||||
/// required by sycl in order to extract the sycl accessor
|
||||
const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
|
||||
/// used by sycl in order to build the sycl buffer
|
||||
const Device& device() const{return m_device;}
|
||||
private:
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
const ArgType m_op;
|
||||
const Device& m_device;
|
||||
typename MakePointer<CoeffReturnType>::Type m_buffer;
|
||||
};
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
|
|
@@ -0,0 +1,109 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// MakePointer class is used as a container of the address space of the pointer
|
||||
// on the host and on the device. From the host side it generates the T* pointer
|
||||
// and when EIGEN_USE_SYCL is used it constructs a buffer with a map_allocator to
|
||||
// T* m_data on the host. It is always called on the device.
|
||||
// Specialisation of MakePointer class for creating the sycl buffer with
|
||||
// map_allocator.
|
||||
template<typename T> struct MakePointer {
|
||||
typedef T* Type;
|
||||
};
|
||||
|
||||
template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
|
||||
template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
|
||||
template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
|
||||
template<typename PlainObjectType> class TensorRef;
|
||||
template<typename Derived, int AccessLevel> class TensorBase;
|
||||
|
||||
template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp;
|
||||
template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
|
||||
template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
|
||||
template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp;
|
||||
template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
|
||||
template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_ = MakePointer > class TensorReductionOp;
|
||||
template<typename XprType> class TensorIndexTupleOp;
|
||||
template<typename ReduceOp, typename Dims, typename XprType> class TensorTupleReducerOp;
|
||||
template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
|
||||
template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp;
|
||||
template<typename TargetType, typename XprType> class TensorConversionOp;
|
||||
template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
|
||||
template<typename FFT, typename XprType, int FFTDataType, int FFTDirection> class TensorFFTOp;
|
||||
template<typename PatchDim, typename XprType> class TensorPatchOp;
|
||||
template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp;
|
||||
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorVolumePatchOp;
|
||||
template<typename Broadcast, typename XprType> class TensorBroadcastingOp;
|
||||
template<DenseIndex DimId, typename XprType> class TensorChippingOp;
|
||||
template<typename NewDimensions, typename XprType> class TensorReshapingOp;
|
||||
template<typename XprType> class TensorLayoutSwapOp;
|
||||
template<typename StartIndices, typename Sizes, typename XprType> class TensorSlicingOp;
|
||||
template<typename ReverseDimensions, typename XprType> class TensorReverseOp;
|
||||
template<typename PaddingDimensions, typename XprType> class TensorPaddingOp;
|
||||
template<typename Shuffle, typename XprType> class TensorShufflingOp;
|
||||
template<typename Strides, typename XprType> class TensorStridingOp;
|
||||
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> class TensorStridingSlicingOp;
|
||||
template<typename Strides, typename XprType> class TensorInflationOp;
|
||||
template<typename Generator, typename XprType> class TensorGeneratorOp;
|
||||
template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
|
||||
template<typename Op, typename XprType> class TensorScanOp;
|
||||
|
||||
template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
|
||||
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
|
||||
|
||||
template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
|
||||
template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp;
|
||||
|
||||
template<typename ExpressionType, typename DeviceType> class TensorDevice;
|
||||
template<typename Derived, typename Device> struct TensorEvaluator;
|
||||
|
||||
struct DefaultDevice;
|
||||
struct ThreadPoolDevice;
|
||||
struct GpuDevice;
|
||||
struct SyclDevice;
|
||||
|
||||
enum FFTResultType {
|
||||
RealPart = 0,
|
||||
ImagPart = 1,
|
||||
BothParts = 2
|
||||
};
|
||||
|
||||
enum FFTDirection {
|
||||
FFT_FORWARD = 0,
|
||||
FFT_REVERSE = 1
|
||||
};
|
||||
|
||||
|
||||
namespace internal {
|
||||
|
||||
template <typename Device, typename Expression>
|
||||
struct IsVectorizable {
|
||||
static const bool value = TensorEvaluator<Expression, Device>::PacketAccess;
|
||||
};
|
||||
|
||||
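// On GPU, vectorized access additionally requires the data to be aligned.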
template <typename Expression>
|
||||
struct IsVectorizable<GpuDevice, Expression> {
|
||||
static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess &&
|
||||
TensorEvaluator<Expression, GpuDevice>::IsAligned;
|
||||
};
|
||||
|
||||
template <typename Expression, typename Device,
|
||||
bool Vectorizable = IsVectorizable<Device, Expression>::value>
|
||||
class TensorExecutor;
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
|
|
@@ -0,0 +1,489 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
|
||||
/** \internal
|
||||
* \brief Template functor to compute the modulo between an array and a scalar.
|
||||
*/
|
||||
template <typename Scalar>
|
||||
struct scalar_mod_op {
|
||||
EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
|
||||
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; }
|
||||
const Scalar m_divisor;
|
||||
};
|
||||
template <typename Scalar>
|
||||
struct functor_traits<scalar_mod_op<Scalar> >
|
||||
{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
|
||||
|
||||
|
||||
/** \internal
|
||||
* \brief Template functor to compute the modulo between 2 arrays.
|
||||
*/
|
||||
template <typename Scalar>
|
||||
struct scalar_mod2_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
|
||||
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
|
||||
};
|
||||
template <typename Scalar>
|
||||
struct functor_traits<scalar_mod2_op<Scalar> >
|
||||
{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
|
||||
|
||||
template <typename Scalar>
|
||||
struct scalar_fmod_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op);
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
|
||||
operator()(const Scalar& a, const Scalar& b) const {
|
||||
return numext::fmod(a, b);
|
||||
}
|
||||
};
|
||||
template <typename Scalar>
|
||||
struct functor_traits<scalar_fmod_op<Scalar> > {
|
||||
enum { Cost = 13, // Reciprocal throughput of FPREM on Haswell.
|
||||
PacketAccess = false };
|
||||
};
|
||||
|
||||
|
||||
/** \internal
|
||||
* \brief Template functor to compute the sigmoid of a scalar
|
||||
* \sa class CwiseUnaryOp, ArrayBase::sigmoid()
|
||||
*/
|
||||
template <typename T>
|
||||
struct scalar_sigmoid_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
|
||||
const T one = T(1);
|
||||
return one / (one + numext::exp(-x));
|
||||
}
|
||||
|
||||
template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Packet packetOp(const Packet& x) const {
|
||||
const Packet one = pset1<Packet>(T(1));
|
||||
return pdiv(one, padd(one, pexp(pnegate(x))));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct functor_traits<scalar_sigmoid_op<T> > {
|
||||
enum {
|
||||
Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 6,
|
||||
PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
|
||||
packet_traits<T>::HasNegate && packet_traits<T>::HasExp
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template<typename Reducer, typename Device>
|
||||
struct reducer_traits {
|
||||
enum {
|
||||
Cost = 1,
|
||||
PacketAccess = false
|
||||
};
|
||||
};
|
||||
|
||||
// Standard reduction functors
|
||||
template <typename T> struct SumReducer
|
||||
{
|
||||
static const bool PacketAccess = packet_traits<T>::HasAdd;
|
||||
static const bool IsStateful = false;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
|
||||
internal::scalar_sum_op<T> sum_op;
|
||||
*accum = sum_op(*accum, t);
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
|
||||
(*accum) = padd<Packet>(*accum, p);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
|
||||
internal::scalar_cast_op<int, T> conv;
|
||||
return conv(0);
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
|
||||
return pset1<Packet>(initialize());
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
|
||||
return accum;
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
|
||||
return vaccum;
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
|
||||
internal::scalar_sum_op<T> sum_op;
|
||||
return sum_op(saccum, predux(vaccum));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename Device>
|
||||
struct reducer_traits<SumReducer<T>, Device> {
|
||||
enum {
|
||||
Cost = NumTraits<T>::AddCost,
|
||||
PacketAccess = PacketType<T, Device>::HasAdd
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template <typename T> struct MeanReducer
|
||||
{
|
||||
static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
|
||||
static const bool IsStateful = true;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
MeanReducer() : scalarCount_(0), packetCount_(0) { }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
|
||||
internal::scalar_sum_op<T> sum_op;
|
||||
*accum = sum_op(*accum, t);
|
||||
scalarCount_++;
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) {
|
||||
(*accum) = padd<Packet>(*accum, p);
|
||||
packetCount_++;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
|
||||
internal::scalar_cast_op<int, T> conv;
|
||||
return conv(0);
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
|
||||
return pset1<Packet>(initialize());
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
|
||||
return accum / scalarCount_;
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
|
||||
return pdiv(vaccum, pset1<Packet>(packetCount_));
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
|
||||
internal::scalar_sum_op<T> sum_op;
|
||||
return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits<Packet>::size);
|
||||
}
|
||||
|
||||
protected:
|
||||
DenseIndex scalarCount_;
|
||||
DenseIndex packetCount_;
|
||||
};
|
||||
|
||||
template <typename T, typename Device>
|
||||
struct reducer_traits<MeanReducer<T>, Device> {
|
||||
enum {
|
||||
Cost = NumTraits<T>::AddCost,
|
||||
PacketAccess = PacketType<T, Device>::HasAdd
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
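// Neutral starting value for min/max reductions: the most negative (for max) or most
// positive (for min) representable value, using +/- infinity for floating point types.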
template <typename T, bool IsMax = true, bool IsInteger = true>
|
||||
struct MinMaxBottomValue {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
|
||||
return Eigen::NumTraits<T>::lowest();
|
||||
}
|
||||
};
|
||||
template <typename T>
|
||||
struct MinMaxBottomValue<T, true, false> {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
|
||||
return -Eigen::NumTraits<T>::infinity();
|
||||
}
|
||||
};
|
||||
template <typename T>
|
||||
struct MinMaxBottomValue<T, false, true> {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
|
||||
return Eigen::NumTraits<T>::highest();
|
||||
}
|
||||
};
|
||||
template <typename T>
|
||||
struct MinMaxBottomValue<T, false, false> {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
|
||||
return Eigen::NumTraits<T>::infinity();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename T> struct MaxReducer
|
||||
{
|
||||
static const bool PacketAccess = packet_traits<T>::HasMax;
|
||||
static const bool IsStateful = false;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
|
||||
if (t > *accum) { *accum = t; }
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
|
||||
(*accum) = pmax<Packet>(*accum, p);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
|
||||
return MinMaxBottomValue<T, true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
|
||||
return pset1<Packet>(initialize());
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
|
||||
return accum;
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
|
||||
return vaccum;
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
|
||||
return numext::maxi(saccum, predux_max(vaccum));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename Device>
|
||||
struct reducer_traits<MaxReducer<T>, Device> {
|
||||
enum {
|
||||
Cost = NumTraits<T>::AddCost,
|
||||
PacketAccess = PacketType<T, Device>::HasMax
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template <typename T> struct MinReducer
|
||||
{
|
||||
static const bool PacketAccess = packet_traits<T>::HasMin;
|
||||
static const bool IsStateful = false;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
|
||||
if (t < *accum) { *accum = t; }
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
|
||||
(*accum) = pmin<Packet>(*accum, p);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
|
||||
return MinMaxBottomValue<T, false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
|
||||
return pset1<Packet>(initialize());
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
|
||||
return accum;
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
|
||||
return vaccum;
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
|
||||
return numext::mini(saccum, predux_min(vaccum));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename Device>
|
||||
struct reducer_traits<MinReducer<T>, Device> {
|
||||
enum {
|
||||
Cost = NumTraits<T>::AddCost,
|
||||
PacketAccess = PacketType<T, Device>::HasMin
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template <typename T> struct ProdReducer
|
||||
{
|
||||
static const bool PacketAccess = packet_traits<T>::HasMul;
|
||||
static const bool IsStateful = false;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
|
||||
internal::scalar_product_op<T> prod_op;
|
||||
(*accum) = prod_op(*accum, t);
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
|
||||
(*accum) = pmul<Packet>(*accum, p);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
|
||||
internal::scalar_cast_op<int, T> conv;
|
||||
return conv(1);
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
|
||||
return pset1<Packet>(initialize());
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
|
||||
return accum;
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
|
||||
return vaccum;
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
|
||||
internal::scalar_product_op<T> prod_op;
|
||||
return prod_op(saccum, predux_mul(vaccum));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename Device>
|
||||
struct reducer_traits<ProdReducer<T>, Device> {
|
||||
enum {
|
||||
Cost = NumTraits<T>::MulCost,
|
||||
PacketAccess = PacketType<T, Device>::HasMul
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
struct AndReducer
{
  static const bool PacketAccess = false;
  static const bool IsStateful = false;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
    *accum = *accum && t;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const {
    return true;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const {
    return accum;
  }
};

template <typename Device>
struct reducer_traits<AndReducer, Device> {
  enum {
    Cost = 1,
    PacketAccess = false
  };
};


struct OrReducer {
  static const bool PacketAccess = false;
  static const bool IsStateful = false;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
    *accum = *accum || t;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const {
    return false;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const {
    return accum;
  }
};

template <typename Device>
struct reducer_traits<OrReducer, Device> {
  enum {
    Cost = 1,
    PacketAccess = false
  };
};


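// Usage sketch (editorial, not part of the committed file): AndReducer and
// OrReducer back the boolean all() and any() reductions. A coefficient-wise
// comparison yields a bool-valued tensor expression that can then be reduced:
#include <unsupported/Eigen/CXX11/Tensor>

void all_any_example() {
  Eigen::Tensor<float, 2> t(2, 2);
  t.setValues({{1.0f, -1.0f}, {2.0f, 3.0f}});
  Eigen::Tensor<bool, 0> all_positive = (t > t.constant(0.0f)).all();  // false
  Eigen::Tensor<bool, 0> any_positive = (t > t.constant(0.0f)).any();  // true
}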
// Argmin/Argmax reducers
template <typename T> struct ArgMaxTupleReducer
{
  static const bool PacketAccess = false;
  static const bool IsStateful = false;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
    if (t.second > accum->second) { *accum = t; }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
    return T(0, NumTraits<typename T::second_type>::lowest());
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const {
    return accum;
  }
};

template <typename T, typename Device>
struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
  enum {
    Cost = NumTraits<T>::AddCost,
    PacketAccess = false
  };
};


template <typename T> struct ArgMinTupleReducer
{
  static const bool PacketAccess = false;
  static const bool IsStateful = false;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const {
    if (t.second < accum->second) { *accum = t; }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
    return T(0, NumTraits<typename T::second_type>::highest());
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const {
    return accum;
  }
};

template <typename T, typename Device>
struct reducer_traits<ArgMinTupleReducer<T>, Device> {
  enum {
    Cost = NumTraits<T>::AddCost,
    PacketAccess = false
  };
};


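// Usage sketch (editorial, not part of the committed file): the tuple reducers
// back argmax()/argmin(), which return index-valued tensors (DenseIndex is
// assumed as the index type here):
#include <unsupported/Eigen/CXX11/Tensor>

void argmax_example() {
  Eigen::Tensor<float, 2> t(2, 3);
  t.setValues({{1.0f, 5.0f, 2.0f}, {4.0f, 0.0f, 3.0f}});
  Eigen::Tensor<Eigen::DenseIndex, 1> am = t.argmax(1);  // per-row index of the maximum
  // am(0) == 1, am(1) == 0
}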
template <typename T, typename Index, size_t NumDims>
class GaussianGenerator {
 public:
  static const bool PacketAccess = false;

  EIGEN_DEVICE_FUNC GaussianGenerator(const array<T, NumDims>& means,
                                      const array<T, NumDims>& std_devs)
      : m_means(means)
  {
    for (size_t i = 0; i < NumDims; ++i) {
      m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2;
    }
  }

  EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
    T tmp = T(0);
    for (size_t i = 0; i < NumDims; ++i) {
      T offset = coordinates[i] - m_means[i];
      tmp += offset * offset / m_two_sigmas[i];
    }
    return numext::exp(-tmp);
  }

 private:
  array<T, NumDims> m_means;
  array<T, NumDims> m_two_sigmas;
};

template <typename T, typename Index, size_t NumDims>
struct functor_traits<GaussianGenerator<T, Index, NumDims> > {
  enum {
    Cost = NumDims * (2 * NumTraits<T>::AddCost + NumTraits<T>::MulCost +
                      functor_traits<scalar_quotient_op<T, T> >::Cost) +
           functor_traits<scalar_exp_op<T> >::Cost,
    PacketAccess = GaussianGenerator<T, Index, NumDims>::PacketAccess
  };
};

} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
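// Usage sketch (editorial, not part of the committed file): GaussianGenerator is
// meant to be plugged into the generate() API declared in TensorGenerator.h.
// Note that in this version the class lives in Eigen::internal:
#include <unsupported/Eigen/CXX11/Tensor>

void gaussian_generator_example() {
  Eigen::Tensor<float, 2> shape_holder(5, 5);  // only its dimensions are used
  Eigen::array<float, 2> means    = {{2.0f, 2.0f}};
  Eigen::array<float, 2> std_devs = {{1.0f, 1.0f}};
  Eigen::internal::GaussianGenerator<float, Eigen::DenseIndex, 2> gauss(means, std_devs);
  Eigen::Tensor<float, 2> bump = shape_holder.generate(gauss);  // 2-D Gaussian bump
}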
@@ -0,0 +1,185 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at the mozilla.org home page

#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H

namespace Eigen {

/** \class TensorGeneratorOp
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor generator class.
  *
  * Generates a tensor coefficient by coefficient by calling a user-supplied
  * generator functor with the coordinates of each coefficient.
  */
namespace internal {
template<typename Generator, typename XprType>
struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType>
{
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
};

template<typename Generator, typename XprType>
struct eval<TensorGeneratorOp<Generator, XprType>, Eigen::Dense>
{
  typedef const TensorGeneratorOp<Generator, XprType>& type;
};

template<typename Generator, typename XprType>
struct nested<TensorGeneratorOp<Generator, XprType>, 1, typename eval<TensorGeneratorOp<Generator, XprType> >::type>
{
  typedef TensorGeneratorOp<Generator, XprType> type;
};

} // end namespace internal


template<typename Generator, typename XprType>
class TensorGeneratorOp : public TensorBase<TensorGeneratorOp<Generator, XprType>, ReadOnlyAccessors>
{
  public:
  typedef typename Eigen::internal::traits<TensorGeneratorOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorGeneratorOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorGeneratorOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorGeneratorOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator)
      : m_xpr(expr), m_generator(generator) {}

  EIGEN_DEVICE_FUNC
  const Generator& generator() const { return m_generator; }

  EIGEN_DEVICE_FUNC
  const typename internal::remove_all<typename XprType::Nested>::type&
  expression() const { return m_xpr; }

  protected:
  typename XprType::Nested m_xpr;
  const Generator m_generator;
};


// Eval as rvalue
template<typename Generator, typename ArgType, typename Device>
struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
{
  typedef TensorGeneratorOp<Generator, ArgType> XprType;
  typedef typename XprType::Index Index;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
  static const int NumDims = internal::array_size<Dimensions>::value;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  enum {
    IsAligned = false,
    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
    BlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_generator(op.generator())
  {
    TensorEvaluator<ArgType, Device> impl(op.expression(), device);
    m_dimensions = impl.dimensions();

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_strides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
      }
    } else {
      m_strides[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
    return true;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    array<Index, NumDims> coords;
    extract_coordinates(index, coords);
    return m_generator(coords);
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
    for (int i = 0; i < packetSize; ++i) {
      values[i] = coeff(index+i);
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool) const {
    // TODO(rmlarsen): This is just a placeholder. Define interface to make
    // generators return their cost.
    return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() +
                              TensorOpCost::MulCost<Scalar>());
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 protected:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void extract_coordinates(Index index, array<Index, NumDims>& coords) const {
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_strides[i];
        index -= idx * m_strides[i];
        coords[i] = idx;
      }
      coords[0] = index;
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_strides[i];
        index -= idx * m_strides[i];
        coords[i] = idx;
      }
      coords[NumDims-1] = index;
    }
  }

  Dimensions m_dimensions;
  array<Index, NumDims> m_strides;
  Generator m_generator;
};

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
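// Usage sketch (editorial, not part of the committed file): any functor exposing
// operator()(const array<Index, NumDims>&) can drive TensorGeneratorOp through
// the generate() method. CoordSumGenerator below is a hypothetical example;
// depending on the Eigen version a functor_traits specialization may also be
// wanted for cost estimation.
#include <unsupported/Eigen/CXX11/Tensor>

struct CoordSumGenerator {
  // Fill each coefficient with the sum of its coordinates.
  float operator()(const Eigen::array<Eigen::DenseIndex, 2>& coords) const {
    return static_cast<float>(coords[0] + coords[1]);
  }
};

void generator_example() {
  Eigen::Tensor<float, 2> shape_holder(3, 4);
  Eigen::Tensor<float, 2> g = shape_holder.generate(CoordSumGenerator());
  // g(i, j) == float(i + j)
}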
@@ -0,0 +1,33 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at the mozilla.org home page

#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H

namespace Eigen {

/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) applied to the given tensors.
  *
  * This function computes the regularized incomplete beta function (integral).
  *
  */
template <typename ADerived, typename BDerived, typename XDerived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
    TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>,
                         const ADerived, const BDerived, const XDerived>
    betainc(const ADerived& a, const BDerived& b, const XDerived& x) {
  return TensorCwiseTernaryOp<
      internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived,
      const BDerived, const XDerived>(
      a, b, x, internal::scalar_betainc_op<typename XDerived::Scalar>());
}

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
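// Usage sketch (editorial, not part of the committed file): betainc() is applied
// coefficient-wise to three tensors of identical shape and scalar type:
#include <unsupported/Eigen/CXX11/Tensor>

void betainc_example() {
  Eigen::Tensor<float, 1> a(3), b(3), x(3);
  a.setConstant(2.0f);
  b.setConstant(3.0f);
  x.setValues({0.1f, 0.5f, 0.9f});
  Eigen::Tensor<float, 1> r = Eigen::betainc(a, b, x);  // regularized incomplete beta
}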
@@ -0,0 +1,79 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at the mozilla.org home page

#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H
#define EIGEN_CXX11_TENSOR_TENSOR_IO_H

namespace Eigen {

namespace internal {

// Print the tensor as a 2d matrix
template <typename Tensor, int Rank>
struct TensorPrinter {
  static void run (std::ostream& os, const Tensor& tensor) {
    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
    typedef typename Tensor::Index Index;
    const Index total_size = internal::array_prod(tensor.dimensions());
    if (total_size > 0) {
      const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
      static const int layout = Tensor::Layout;
      Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
      os << matrix;
    }
  }
};


// Print the tensor as a vector
template <typename Tensor>
struct TensorPrinter<Tensor, 1> {
  static void run (std::ostream& os, const Tensor& tensor) {
    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
    typedef typename Tensor::Index Index;
    const Index total_size = internal::array_prod(tensor.dimensions());
    if (total_size > 0) {
      Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
      os << array;
    }
  }
};


// Print the tensor as a scalar
template <typename Tensor>
struct TensorPrinter<Tensor, 0> {
  static void run (std::ostream& os, const Tensor& tensor) {
    os << tensor.coeff(0);
  }
};
}

template <typename T>
std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
  typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
  typedef typename Evaluator::Dimensions Dimensions;

  // Evaluate the expression if needed
  TensorForcedEvalOp<const T> eval = expr.eval();
  Evaluator tensor(eval, DefaultDevice());
  tensor.evalSubExprsIfNeeded(NULL);

  // Print the result
  static const int rank = internal::array_size<Dimensions>::value;
  internal::TensorPrinter<Evaluator, rank>::run(os, tensor);

  // Cleanup.
  tensor.cleanup();
  return os;
}

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H
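// Usage sketch (editorial, not part of the committed file): the operator<<
// defined above lets any tensor expression be streamed directly:
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

void tensor_io_example() {
  Eigen::Tensor<int, 2> t(2, 3);
  t.setValues({{1, 2, 3}, {4, 5, 6}});
  std::cout << t << std::endl;  // printed as a 2-D matrix
}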
@@ -0,0 +1,509 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at the mozilla.org home page

#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H

namespace Eigen {

/** \class TensorImagePatch
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Patch extraction specialized for image processing.
  * This assumes that the input has at least 3 dimensions, ordered as follows:
  *  1st dimension: channels (of size d)
  *  2nd dimension: rows (of size r)
  *  3rd dimension: columns (of size c)
  * There can be additional dimensions such as time (for video) or batch (for
  * bulk processing) after the first 3.
  * Calling the image patch code with patch_rows and patch_cols is equivalent
  * to calling the regular patch extraction code with parameters d, patch_rows,
  * patch_cols, and 1 for all the additional dimensions.
  */
namespace internal {
template<DenseIndex Rows, DenseIndex Cols, typename XprType>
struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
{
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions + 1;
  static const int Layout = XprTraits::Layout;
};

template<DenseIndex Rows, DenseIndex Cols, typename XprType>
struct eval<TensorImagePatchOp<Rows, Cols, XprType>, Eigen::Dense>
{
  typedef const TensorImagePatchOp<Rows, Cols, XprType>& type;
};

template<DenseIndex Rows, DenseIndex Cols, typename XprType>
struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorImagePatchOp<Rows, Cols, XprType> >::type>
{
  typedef TensorImagePatchOp<Rows, Cols, XprType> type;
};

} // end namespace internal
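// Usage sketch (editorial, not part of the committed file): TensorImagePatchOp is
// normally created through the extract_image_patches() method. For a ColMajor
// input laid out as (channels, rows, cols, batch), the result gains a
// patch-index dimension:
#include <unsupported/Eigen/CXX11/Tensor>

void image_patch_example() {
  Eigen::Tensor<float, 4> img(3, 32, 32, 8);  // channels, rows, cols, batch
  img.setRandom();
  // 5x5 patches with the default unit strides and padding; the result is
  // (channels, patch_rows, patch_cols, number_of_patches, batch).
  Eigen::Tensor<float, 5> patches = img.extract_image_patches(5, 5);
}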
||||
|
||||
template<DenseIndex Rows, DenseIndex Cols, typename XprType>
|
||||
class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorImagePatchOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorImagePatchOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorImagePatchOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorImagePatchOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
|
||||
DenseIndex row_strides, DenseIndex col_strides,
|
||||
DenseIndex in_row_strides, DenseIndex in_col_strides,
|
||||
DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
|
||||
PaddingType padding_type, Scalar padding_value)
|
||||
: m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
|
||||
m_row_strides(row_strides), m_col_strides(col_strides),
|
||||
m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
|
||||
m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
|
||||
m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
|
||||
m_padding_type(padding_type), m_padding_value(padding_value) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
|
||||
DenseIndex row_strides, DenseIndex col_strides,
|
||||
DenseIndex in_row_strides, DenseIndex in_col_strides,
|
||||
DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
|
||||
DenseIndex padding_top, DenseIndex padding_bottom,
|
||||
DenseIndex padding_left, DenseIndex padding_right,
|
||||
Scalar padding_value)
|
||||
: m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
|
||||
m_row_strides(row_strides), m_col_strides(col_strides),
|
||||
m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
|
||||
m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
|
||||
m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
|
||||
m_padding_left(padding_left), m_padding_right(padding_right),
|
||||
m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex patch_rows() const { return m_patch_rows; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex patch_cols() const { return m_patch_cols; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex row_strides() const { return m_row_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex col_strides() const { return m_col_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex in_row_strides() const { return m_in_row_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex in_col_strides() const { return m_in_col_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex row_inflate_strides() const { return m_row_inflate_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex col_inflate_strides() const { return m_col_inflate_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
bool padding_explicit() const { return m_padding_explicit; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex padding_top() const { return m_padding_top; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex padding_bottom() const { return m_padding_bottom; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex padding_left() const { return m_padding_left; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex padding_right() const { return m_padding_right; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
PaddingType padding_type() const { return m_padding_type; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
Scalar padding_value() const { return m_padding_value; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const DenseIndex m_patch_rows;
|
||||
const DenseIndex m_patch_cols;
|
||||
const DenseIndex m_row_strides;
|
||||
const DenseIndex m_col_strides;
|
||||
const DenseIndex m_in_row_strides;
|
||||
const DenseIndex m_in_col_strides;
|
||||
const DenseIndex m_row_inflate_strides;
|
||||
const DenseIndex m_col_inflate_strides;
|
||||
const bool m_padding_explicit;
|
||||
const DenseIndex m_padding_top;
|
||||
const DenseIndex m_padding_bottom;
|
||||
const DenseIndex m_padding_left;
|
||||
const DenseIndex m_padding_right;
|
||||
const PaddingType m_padding_type;
|
||||
const Scalar m_padding_value;
|
||||
};
|
||||
|
||||
// Eval as rvalue
|
||||
template<DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
|
||||
{
|
||||
typedef TensorImagePatchOp<Rows, Cols, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
static const int NumDims = NumInputDims + 1;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
|
||||
typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>,
|
||||
Device> Self;
|
||||
typedef TensorEvaluator<ArgType, Device> Impl;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
|
||||
m_paddingValue = op.padding_value();
|
||||
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
|
||||
// Caches a few variables.
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_inputDepth = input_dims[0];
|
||||
m_inputRows = input_dims[1];
|
||||
m_inputCols = input_dims[2];
|
||||
} else {
|
||||
m_inputDepth = input_dims[NumInputDims-1];
|
||||
m_inputRows = input_dims[NumInputDims-2];
|
||||
m_inputCols = input_dims[NumInputDims-3];
|
||||
}
|
||||
|
||||
m_row_strides = op.row_strides();
|
||||
m_col_strides = op.col_strides();
|
||||
|
||||
// Input strides and effective input/patch size
|
||||
m_in_row_strides = op.in_row_strides();
|
||||
m_in_col_strides = op.in_col_strides();
|
||||
m_row_inflate_strides = op.row_inflate_strides();
|
||||
m_col_inflate_strides = op.col_inflate_strides();
|
||||
    // The "effective" input rows and input cols are the input rows and cols
    // after inflating them with zeros.
    // For example, a 2x3 matrix with row_inflate_strides and
    // col_inflate_strides of 2:
    //   A B C
    //   D E F
    //
    // is inflated to the following 3x5 matrix:
    //
    //   A . B . C
    //   . . . . .
    //   D . E . F
|
||||
|
||||
m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1;
|
||||
m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1;
|
||||
m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1);
|
||||
m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1);
|
||||
|
||||
if (op.padding_explicit()) {
|
||||
m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
|
||||
m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
|
||||
m_rowPaddingTop = op.padding_top();
|
||||
m_colPaddingLeft = op.padding_left();
|
||||
} else {
|
||||
// Computing padding from the type
|
||||
switch (op.padding_type()) {
|
||||
case PADDING_VALID:
|
||||
m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
|
||||
m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
|
||||
// Calculate the padding
|
||||
m_rowPaddingTop = numext::maxi<Index>(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2);
|
||||
m_colPaddingLeft = numext::maxi<Index>(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2);
|
||||
break;
|
||||
case PADDING_SAME:
|
||||
m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
|
||||
m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
|
||||
// Calculate the padding
|
||||
m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2;
|
||||
m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2;
|
||||
break;
|
||||
default:
|
||||
eigen_assert(false && "unexpected padding");
|
||||
}
|
||||
}
|
||||
eigen_assert(m_outputRows > 0);
|
||||
eigen_assert(m_outputCols > 0);
|
||||
|
||||
// Dimensions for result of extraction.
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
// ColMajor
|
||||
// 0: depth
|
||||
// 1: patch_rows
|
||||
// 2: patch_cols
|
||||
// 3: number of patches
|
||||
// 4 and beyond: anything else (such as batch).
|
||||
m_dimensions[0] = input_dims[0];
|
||||
m_dimensions[1] = op.patch_rows();
|
||||
m_dimensions[2] = op.patch_cols();
|
||||
m_dimensions[3] = m_outputRows * m_outputCols;
|
||||
for (int i = 4; i < NumDims; ++i) {
|
||||
m_dimensions[i] = input_dims[i-1];
|
||||
}
|
||||
} else {
|
||||
// RowMajor
|
||||
// NumDims-1: depth
|
||||
// NumDims-2: patch_rows
|
||||
// NumDims-3: patch_cols
|
||||
// NumDims-4: number of patches
|
||||
// NumDims-5 and beyond: anything else (such as batch).
|
||||
m_dimensions[NumDims-1] = input_dims[NumInputDims-1];
|
||||
m_dimensions[NumDims-2] = op.patch_rows();
|
||||
m_dimensions[NumDims-3] = op.patch_cols();
|
||||
m_dimensions[NumDims-4] = m_outputRows * m_outputCols;
|
||||
for (int i = NumDims-5; i >= 0; --i) {
|
||||
m_dimensions[i] = input_dims[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Strides for moving the patch in various dimensions.
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_colStride = m_dimensions[1];
|
||||
m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0];
|
||||
m_otherStride = m_patchStride * m_dimensions[3];
|
||||
} else {
|
||||
m_colStride = m_dimensions[NumDims-2];
|
||||
m_patchStride = m_colStride * m_dimensions[NumDims-3] * m_dimensions[NumDims-1];
|
||||
m_otherStride = m_patchStride * m_dimensions[NumDims-4];
|
||||
}
|
||||
|
||||
// Strides for navigating through the input tensor.
|
||||
m_rowInputStride = m_inputDepth;
|
||||
m_colInputStride = m_inputDepth * m_inputRows;
|
||||
m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols;
|
||||
|
||||
// Fast representations of different variables.
|
||||
m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
|
||||
m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
|
||||
m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
|
||||
m_fastInflateRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides);
|
||||
m_fastInflateColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides);
|
||||
m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff);
|
||||
|
||||
// Number of patches in the width dimension.
|
||||
m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]);
|
||||
} else {
|
||||
m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
// Patch index corresponding to the passed in index.
|
||||
const Index patchIndex = index / m_fastPatchStride;
|
||||
// Find the offset of the element wrt the location of the first element.
|
||||
const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth;
|
||||
|
||||
// Other ways to index this element.
|
||||
const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride;
|
||||
const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
|
||||
|
||||
// Calculate col index in the input original tensor.
|
||||
const Index colIndex = patch2DIndex / m_fastOutputRows;
|
||||
const Index colOffset = patchOffset / m_fastColStride;
|
||||
const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
|
||||
const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0);
|
||||
if (inputCol < 0 || inputCol >= m_input_cols_eff ||
|
||||
((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
|
||||
return Scalar(m_paddingValue);
|
||||
}
|
||||
|
||||
// Calculate row index in the original input tensor.
|
||||
const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
|
||||
const Index rowOffset = patchOffset - colOffset * m_colStride;
|
||||
const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
|
||||
const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0);
|
||||
if (inputRow < 0 || inputRow >= m_input_rows_eff ||
|
||||
((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
|
||||
return Scalar(m_paddingValue);
|
||||
}
|
||||
|
||||
const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
|
||||
const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
|
||||
|
||||
const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride;
|
||||
return m_impl.coeff(inputIndex);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
|
||||
return packetWithPossibleZero(index);
|
||||
}
|
||||
|
||||
const Index indices[2] = {index, index + PacketSize - 1};
|
||||
const Index patchIndex = indices[0] / m_fastPatchStride;
|
||||
if (patchIndex != indices[1] / m_fastPatchStride) {
|
||||
return packetWithPossibleZero(index);
|
||||
}
|
||||
const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride;
|
||||
eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
|
||||
|
||||
// Find the offset of the element wrt the location of the first element.
|
||||
const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth,
|
||||
(indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth};
|
||||
|
||||
const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
|
||||
eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
|
||||
|
||||
const Index colIndex = patch2DIndex / m_fastOutputRows;
|
||||
const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride};
|
||||
|
||||
// Calculate col indices in the original input tensor.
|
||||
const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] -
|
||||
m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
|
||||
if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
|
||||
return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
|
||||
}
|
||||
|
||||
if (inputCols[0] == inputCols[1]) {
|
||||
const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
|
||||
const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride};
|
||||
eigen_assert(rowOffsets[0] <= rowOffsets[1]);
|
||||
// Calculate col indices in the original input tensor.
|
||||
const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] -
|
||||
m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
|
||||
|
||||
if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
|
||||
return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
|
||||
}
|
||||
|
||||
if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
|
||||
// no padding
|
||||
const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
|
||||
const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
|
||||
const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride;
|
||||
return m_impl.template packet<Unaligned>(inputIndex);
|
||||
}
|
||||
}
|
||||
|
||||
return packetWithPossibleZero(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||
|
||||
Index rowPaddingTop() const { return m_rowPaddingTop; }
|
||||
Index colPaddingLeft() const { return m_colPaddingLeft; }
|
||||
Index outputRows() const { return m_outputRows; }
|
||||
Index outputCols() const { return m_outputCols; }
|
||||
Index userRowStride() const { return m_row_strides; }
|
||||
Index userColStride() const { return m_col_strides; }
|
||||
Index userInRowStride() const { return m_in_row_strides; }
|
||||
Index userInColStride() const { return m_in_col_strides; }
|
||||
Index rowInflateStride() const { return m_row_inflate_strides; }
|
||||
Index colInflateStride() const { return m_col_inflate_strides; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
// We conservatively estimate the cost for the code path where the computed
|
||||
// index is inside the original image and
|
||||
// TensorEvaluator<ArgType, Device>::CoordAccess is false.
|
||||
const double compute_cost = 3 * TensorOpCost::DivCost<Index>() +
|
||||
6 * TensorOpCost::MulCost<Index>() +
|
||||
8 * TensorOpCost::MulCost<Index>();
|
||||
return m_impl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
|
||||
{
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
|
||||
Dimensions m_dimensions;
|
||||
|
||||
Index m_otherStride;
|
||||
Index m_patchStride;
|
||||
Index m_colStride;
|
||||
Index m_row_strides;
|
||||
Index m_col_strides;
|
||||
|
||||
Index m_in_row_strides;
|
||||
Index m_in_col_strides;
|
||||
Index m_row_inflate_strides;
|
||||
Index m_col_inflate_strides;
|
||||
|
||||
Index m_input_rows_eff;
|
||||
Index m_input_cols_eff;
|
||||
Index m_patch_rows_eff;
|
||||
Index m_patch_cols_eff;
|
||||
|
||||
internal::TensorIntDivisor<Index> m_fastOtherStride;
|
||||
internal::TensorIntDivisor<Index> m_fastPatchStride;
|
||||
internal::TensorIntDivisor<Index> m_fastColStride;
|
||||
internal::TensorIntDivisor<Index> m_fastInflateRowStride;
|
||||
internal::TensorIntDivisor<Index> m_fastInflateColStride;
|
||||
internal::TensorIntDivisor<Index> m_fastInputColsEff;
|
||||
|
||||
Index m_rowInputStride;
|
||||
Index m_colInputStride;
|
||||
Index m_patchInputStride;
|
||||
|
||||
Index m_inputDepth;
|
||||
Index m_inputRows;
|
||||
Index m_inputCols;
|
||||
|
||||
Index m_outputRows;
|
||||
Index m_outputCols;
|
||||
|
||||
Index m_rowPaddingTop;
|
||||
Index m_colPaddingLeft;
|
||||
|
||||
internal::TensorIntDivisor<Index> m_fastOutputRows;
|
||||
internal::TensorIntDivisor<Index> m_fastOutputDepth;
|
||||
|
||||
Scalar m_paddingValue;
|
||||
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
};
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
|
|
@@ -0,0 +1,725 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at the mozilla.org home page

#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H


#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES

#define EIGEN_HAS_INDEX_LIST

namespace Eigen {

/** \internal
  *
  * \class TensorIndexList
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Set of classes used to encode a set of Tensor dimensions/indices.
  *
  * The indices in the list can be known at compile time or at runtime. A mix
  * of static and dynamic indices can also be provided if needed. The tensor
  * code will attempt to take advantage of the indices that are known at
  * compile time to optimize the code it generates.
  *
  * This functionality requires a c++11 compliant compiler. If your compiler
  * is older you need to use arrays of indices instead.
  *
  * Several examples are provided in the cxx11_tensor_index_list.cpp file.
  *
  * \sa Tensor
  */

template <DenseIndex n>
struct type2index {
  static const DenseIndex value = n;
  EIGEN_DEVICE_FUNC constexpr operator DenseIndex() const { return n; }
  EIGEN_DEVICE_FUNC void set(DenseIndex val) {
    eigen_assert(val == n);
  }
};

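// Usage sketch (editorial, not part of the committed file): mixing a compile-time
// type2index<0> with a runtime index, as described in the class comment above:
#include <unsupported/Eigen/CXX11/Tensor>

void index_list_example() {
  Eigen::Tensor<float, 3> t(4, 5, 6);
  t.setRandom();
  Eigen::IndexList<Eigen::type2index<0>, int> dims;  // dimension 0 fixed at compile time
  dims.set(1, 2);                                    // second reduction dimension chosen at runtime
  Eigen::Tensor<float, 1> s = t.sum(dims);           // reduces dimensions 0 and 2
}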
// This can be used with IndexPairList to get compile-time constant pairs,
|
||||
// such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>().
|
||||
template <DenseIndex f, DenseIndex s>
|
||||
struct type2indexpair {
|
||||
static const DenseIndex first = f;
|
||||
static const DenseIndex second = s;
|
||||
|
||||
constexpr EIGEN_DEVICE_FUNC operator IndexPair<DenseIndex>() const {
|
||||
return IndexPair<DenseIndex>(f, s);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC void set(const IndexPair<DenseIndex>& val) {
|
||||
eigen_assert(val.first == f);
|
||||
eigen_assert(val.second == s);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<DenseIndex n> struct NumTraits<type2index<n> >
|
||||
{
|
||||
typedef DenseIndex Real;
|
||||
enum {
|
||||
IsComplex = 0,
|
||||
RequireInitialization = false,
|
||||
ReadCost = 1,
|
||||
AddCost = 1,
|
||||
MulCost = 1
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; }
|
||||
EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; }
|
||||
EIGEN_DEVICE_FUNC static inline Real highest() { return n; }
|
||||
EIGEN_DEVICE_FUNC static inline Real lowest() { return n; }
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) {
|
||||
val = new_val;
|
||||
}
|
||||
template <DenseIndex n>
|
||||
EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, DenseIndex new_val) {
|
||||
val.set(new_val);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<DenseIndex> new_val) {
|
||||
val = new_val;
|
||||
}
|
||||
template <DenseIndex f, DenseIndex s>
|
||||
EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<DenseIndex> new_val) {
|
||||
val.set(new_val);
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
struct is_compile_time_constant {
|
||||
static constexpr bool value = false;
|
||||
};
|
||||
|
||||
template <DenseIndex idx>
|
||||
struct is_compile_time_constant<type2index<idx> > {
|
||||
static constexpr bool value = true;
|
||||
};
|
||||
template <DenseIndex idx>
|
||||
struct is_compile_time_constant<const type2index<idx> > {
|
||||
static constexpr bool value = true;
|
||||
};
|
||||
template <DenseIndex idx>
|
||||
struct is_compile_time_constant<type2index<idx>& > {
|
||||
static constexpr bool value = true;
|
||||
};
|
||||
template <DenseIndex idx>
|
||||
struct is_compile_time_constant<const type2index<idx>& > {
|
||||
static constexpr bool value = true;
|
||||
};
|
||||
|
||||
template <DenseIndex f, DenseIndex s>
|
||||
struct is_compile_time_constant<type2indexpair<f, s> > {
|
||||
static constexpr bool value = true;
|
||||
};
|
||||
template <DenseIndex f, DenseIndex s>
|
||||
struct is_compile_time_constant<const type2indexpair<f, s> > {
|
||||
static constexpr bool value = true;
|
||||
};
|
||||
template <DenseIndex f, DenseIndex s>
|
||||
struct is_compile_time_constant<type2indexpair<f, s>& > {
|
||||
static constexpr bool value = true;
|
||||
};
|
||||
template <DenseIndex f, DenseIndex s>
|
||||
struct is_compile_time_constant<const type2indexpair<f, s>& > {
|
||||
static constexpr bool value = true;
|
||||
};
|
||||
|
||||
|
||||
template<typename... T>
|
||||
struct IndexTuple;
|
||||
|
||||
template<typename T, typename... O>
|
||||
struct IndexTuple<T, O...> {
|
||||
EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() { }
|
||||
EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) { }
|
||||
|
||||
constexpr static int count = 1 + sizeof...(O);
|
||||
T head;
|
||||
IndexTuple<O...> others;
|
||||
typedef T Head;
|
||||
typedef IndexTuple<O...> Other;
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct IndexTuple<T> {
|
||||
EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() { }
|
||||
EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) { }
|
||||
|
||||
constexpr static int count = 1;
|
||||
T head;
|
||||
typedef T Head;
|
||||
};
|
||||
|
||||
|
||||
template<int N, typename... T>
|
||||
struct IndexTupleExtractor;
|
||||
|
||||
template<int N, typename T, typename... O>
|
||||
struct IndexTupleExtractor<N, T, O...> {
|
||||
|
||||
typedef typename IndexTupleExtractor<N-1, O...>::ValType ValType;
|
||||
|
||||
EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) {
|
||||
return IndexTupleExtractor<N-1, O...>::get_val(val.others);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) {
|
||||
return IndexTupleExtractor<N-1, O...>::get_val(val.others);
|
||||
}
|
||||
template <typename V>
|
||||
EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) {
|
||||
IndexTupleExtractor<N-1, O...>::set_val(val.others, new_val);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
template<typename T, typename... O>
|
||||
struct IndexTupleExtractor<0, T, O...> {
|
||||
|
||||
typedef T ValType;
|
||||
|
||||
EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) {
|
||||
return val.head;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) {
|
||||
return val.head;
|
||||
}
|
||||
template <typename V>
|
||||
EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) {
|
||||
val.head = new_val;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
template <int N, typename T, typename... O>
|
||||
EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor<N, T, O...>::ValType& array_get(IndexTuple<T, O...>& tuple) {
|
||||
return IndexTupleExtractor<N, T, O...>::get_val(tuple);
|
||||
}
|
||||
template <int N, typename T, typename... O>
|
||||
EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor<N, T, O...>::ValType& array_get(const IndexTuple<T, O...>& tuple) {
|
||||
return IndexTupleExtractor<N, T, O...>::get_val(tuple);
|
||||
}
|
||||
template <typename T, typename... O>
|
||||
struct array_size<IndexTuple<T, O...> > {
|
||||
static const size_t value = IndexTuple<T, O...>::count;
|
||||
};
|
||||
template <typename T, typename... O>
|
||||
struct array_size<const IndexTuple<T, O...> > {
|
||||
static const size_t value = IndexTuple<T, O...>::count;
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
template <DenseIndex Idx, typename ValueT>
|
||||
struct tuple_coeff {
|
||||
template <typename... T>
|
||||
EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple<T...>& t) {
|
||||
// return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
|
||||
return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t));
|
||||
}
|
||||
template <typename... T>
|
||||
EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT& value) {
|
||||
if (i == Idx) {
|
||||
update_value(array_get<Idx>(t), value);
|
||||
} else {
|
||||
tuple_coeff<Idx-1, ValueT>::set(i, t, value);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... T>
|
||||
EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>& t) {
|
||||
return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) ||
|
||||
tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t);
|
||||
}
|
||||
|
||||
template <typename... T>
|
||||
EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>& t) {
|
||||
return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
|
||||
tuple_coeff<Idx-1, ValueT>::values_up_to_known_statically(t);
|
||||
}
|
||||
|
||||
template <typename... T>
|
||||
EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>& t) {
|
||||
return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
|
||||
is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
|
||||
array_get<Idx>(t) > array_get<Idx-1>(t) &&
|
||||
tuple_coeff<Idx-1, ValueT>::values_up_to_statically_known_to_increase(t);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename ValueT>
|
||||
struct tuple_coeff<0, ValueT> {
|
||||
template <typename... T>
|
||||
EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple<T...>& t) {
|
||||
// eigen_assert (i == 0); // gcc fails to compile assertions in constexpr
|
||||
return array_get<0>(t)/* * (i == 0)*/;
|
||||
}
|
||||
template <typename... T>
|
||||
EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT value) {
|
||||
eigen_assert (i == 0);
|
||||
update_value(array_get<0>(t), value);
|
||||
}
|
||||
template <typename... T>
|
||||
EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>&) {
|
||||
return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value & (i == 0);
|
||||
}
|
||||
|
||||
template <typename... T>
|
||||
EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>&) {
|
||||
return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value;
|
||||
}
|
||||
|
||||
template <typename... T>
|
||||
EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>&) {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
} // namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename FirstType, typename... OtherTypes>
|
||||
struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const {
|
||||
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const {
|
||||
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) {
|
||||
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::set(i, *this, value);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
|
||||
EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple<FirstType, OtherTypes...>(first, other...) { }
|
||||
EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
|
||||
|
||||
EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
|
||||
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const {
|
||||
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_known_statically(*this);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const {
|
||||
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<typename FirstType, typename... OtherTypes>
|
||||
constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
|
||||
return IndexList<FirstType, OtherTypes...>(val1, other_vals...);
|
||||
}
|
||||
|
||||
|
||||
template<typename FirstType, typename... OtherTypes>
|
||||
struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<DenseIndex> operator[] (const DenseIndex i) const {
|
||||
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<DenseIndex>>::get(i, *this);
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair<DenseIndex> value) {
|
||||
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<DenseIndex> >::set(i, *this, value);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
|
||||
EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
|
||||
|
||||
EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
|
||||
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
|
||||
}
|
||||
};
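// Usage sketch (editorial, not part of the committed file): an IndexPairList can
// mix compile-time type2indexpair entries with runtime IndexPair values:
#include <unsupported/Eigen/CXX11/Tensor>

void index_pair_list_example() {
  Eigen::IndexPairList<Eigen::type2indexpair<0, 2>,
                       Eigen::IndexPair<Eigen::DenseIndex> > pairs;
  pairs.set(1, Eigen::IndexPair<Eigen::DenseIndex>(1, 3));  // runtime entry
  Eigen::IndexPair<Eigen::DenseIndex> p0 = pairs[0];        // (0, 2), known statically
  Eigen::IndexPair<Eigen::DenseIndex> p1 = pairs[1];        // (1, 3)
}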
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
|
||||
size_t result = 1;
|
||||
for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
|
||||
result *= sizes[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template<typename FirstType, typename... OtherTypes> struct array_size<IndexList<FirstType, OtherTypes...> > {
|
||||
static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
|
||||
};
|
||||
template<typename FirstType, typename... OtherTypes> struct array_size<const IndexList<FirstType, OtherTypes...> > {
|
||||
static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
|
||||
};
|
||||
|
||||
template<typename FirstType, typename... OtherTypes> struct array_size<IndexPairList<FirstType, OtherTypes...> > {
|
||||
static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
|
||||
};
|
||||
template<typename FirstType, typename... OtherTypes> struct array_size<const IndexPairList<FirstType, OtherTypes...> > {
|
||||
static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
|
||||
};
|
||||
|
||||
template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) {
|
||||
return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
|
||||
}
|
||||
template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(const IndexList<FirstType, OtherTypes...>& a) {
|
||||
return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct index_known_statically_impl {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct index_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) {
|
||||
return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct index_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) {
|
||||
return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename T>
|
||||
struct all_indices_known_statically_impl {
|
||||
static constexpr bool run() {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct all_indices_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run() {
|
||||
return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct all_indices_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run() {
|
||||
return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename T>
|
||||
struct indices_statically_known_to_increase_impl {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run() {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct indices_statically_known_to_increase_impl<IndexList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run() {
|
||||
return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct indices_statically_known_to_increase_impl<const IndexList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run() {
|
||||
return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename Tx>
|
||||
struct index_statically_eq_impl {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct index_statically_eq_impl<IndexList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
|
||||
(IndexList<FirstType, OtherTypes...>().get(i) == value);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct index_statically_eq_impl<const IndexList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
|
||||
(IndexList<FirstType, OtherTypes...>().get(i) == value);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename T>
|
||||
struct index_statically_ne_impl {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct index_statically_ne_impl<IndexList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
|
||||
(IndexList<FirstType, OtherTypes...>().get(i) != value);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct index_statically_ne_impl<const IndexList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
|
||||
(IndexList<FirstType, OtherTypes...>().get(i) != value);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename T>
|
||||
struct index_statically_gt_impl {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct index_statically_gt_impl<IndexList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
|
||||
(IndexList<FirstType, OtherTypes...>().get(i) > value);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct index_statically_gt_impl<const IndexList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
|
||||
(IndexList<FirstType, OtherTypes...>().get(i) > value);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
template <typename T>
|
||||
struct index_statically_lt_impl {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct index_statically_lt_impl<IndexList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
|
||||
(IndexList<FirstType, OtherTypes...>().get(i) < value);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
|
||||
(IndexList<FirstType, OtherTypes...>().get(i) < value);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
template <typename Tx>
|
||||
struct index_pair_first_statically_eq_impl {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
|
||||
(IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
|
||||
(IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
template <typename Tx>
|
||||
struct index_pair_second_statically_eq_impl {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
|
||||
(IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename FirstType, typename... OtherTypes>
|
||||
struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
|
||||
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
|
||||
return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
|
||||
(IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#else
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
template <typename T>
|
||||
struct index_known_statically_impl {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct all_indices_known_statically_impl {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct indices_statically_known_to_increase_impl {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct index_statically_eq_impl {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct index_statically_ne_impl {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct index_statically_gt_impl {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct index_statically_lt_impl {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Tx>
|
||||
struct index_pair_first_statically_eq_impl {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Tx>
|
||||
struct index_pair_second_statically_eq_impl {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
template <typename T>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(DenseIndex i) {
|
||||
return index_known_statically_impl<T>::run(i);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() {
|
||||
return all_indices_known_statically_impl<T>::run();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() {
|
||||
return indices_statically_known_to_increase_impl<T>::run();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(DenseIndex i, DenseIndex value) {
|
||||
return index_statically_eq_impl<T>::run(i, value);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(DenseIndex i, DenseIndex value) {
|
||||
return index_statically_ne_impl<T>::run(i, value);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(DenseIndex i, DenseIndex value) {
|
||||
return index_statically_gt_impl<T>::run(i, value);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i, DenseIndex value) {
|
||||
return index_statically_lt_impl<T>::run(i, value);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) {
|
||||
return index_pair_first_statically_eq_impl<T>::run(i, value);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) {
|
||||
return index_pair_second_statically_eq_impl<T>::run(i, value);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
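// A usage sketch (illustrative, not part of this header). Mixing a
// compile-time index (Eigen::type2index<0>, defined earlier in this header)
// with a runtime index lets the helpers above (index_known_statically,
// index_statically_eq, ...) resolve the first reduction dimension at compile
// time while the second one stays dynamic.
#include <unsupported/Eigen/CXX11/Tensor>

inline void index_list_example() {
  Eigen::Tensor<float, 3> t(2, 3, 4);
  t.setRandom();
  Eigen::IndexList<Eigen::type2index<0>, int> reduction_dims;
  reduction_dims.set(1, 2);  // the second reduction dimension is a runtime value
  Eigen::Tensor<float, 1> r = t.sum(reduction_dims);  // reduces dims 0 and 2
}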
@ -0,0 +1,229 @@
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Ke Yang <yangke@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorInflation
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor inflation class.
|
||||
*
|
||||
*
|
||||
*/
|
||||
namespace internal {
|
||||
template<typename Strides, typename XprType>
|
||||
struct traits<TensorInflationOp<Strides, XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename Strides, typename XprType>
|
||||
struct eval<TensorInflationOp<Strides, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorInflationOp<Strides, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename Strides, typename XprType>
|
||||
struct nested<TensorInflationOp<Strides, XprType>, 1, typename eval<TensorInflationOp<Strides, XprType> >::type>
|
||||
{
|
||||
typedef TensorInflationOp<Strides, XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
template<typename Strides, typename XprType>
|
||||
class TensorInflationOp : public TensorBase<TensorInflationOp<Strides, XprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorInflationOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorInflationOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorInflationOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorInflationOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides)
|
||||
: m_xpr(expr), m_strides(strides) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const Strides& strides() const { return m_strides; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const Strides m_strides;
|
||||
};
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename Strides, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
|
||||
{
|
||||
typedef TensorInflationOp<Strides, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_strides(op.strides())
|
||||
{
|
||||
m_dimensions = m_impl.dimensions();
|
||||
// Expand each dimension to the inflated dimension.
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1;
|
||||
}
|
||||
|
||||
// Remember the strides for fast division.
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
m_fastStrides[i] = internal::TensorIntDivisor<Index>(m_strides[i]);
|
||||
}
|
||||
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_outputStrides[0] = 1;
|
||||
m_inputStrides[0] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
|
||||
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
|
||||
}
|
||||
} else { // RowMajor
|
||||
m_outputStrides[NumDims-1] = 1;
|
||||
m_inputStrides[NumDims-1] = 1;
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
|
||||
m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
// Computes the input index given the output index. Returns true if the output
|
||||
// index doesn't fall into a hole.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const
|
||||
{
|
||||
eigen_assert(index < dimensions().TotalSize());
|
||||
*inputIndex = 0;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx = index / m_outputStrides[i];
|
||||
if (idx != idx / m_fastStrides[i] * m_strides[i]) {
|
||||
return false;
|
||||
}
|
||||
*inputIndex += idx / m_strides[i] * m_inputStrides[i];
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
if (index != index / m_fastStrides[0] * m_strides[0]) {
|
||||
return false;
|
||||
}
|
||||
*inputIndex += index / m_strides[0];
|
||||
return true;
|
||||
} else {
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const Index idx = index / m_outputStrides[i];
|
||||
if (idx != idx / m_fastStrides[i] * m_strides[i]) {
|
||||
return false;
|
||||
}
|
||||
*inputIndex += idx / m_strides[i] * m_inputStrides[i];
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
if (index != index / m_fastStrides[NumDims-1] * m_strides[NumDims-1]) {
|
||||
return false;
|
||||
}
|
||||
*inputIndex += index / m_strides[NumDims - 1];
|
||||
}
|
||||
return true;
|
||||
}
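// Worked example (illustrative): with a single input dimension of size 3 and
// strides = {2}, the output has size (3-1)*2+1 = 5. Output index 2 satisfies
// 2 == (2/2)*2, so it maps to input index 1; output index 1 fails the check
// (1 != (1/2)*2), falls into a hole, and coeff() below returns Scalar(0).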
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
Index inputIndex = 0;
|
||||
if (getInputIndex(index, &inputIndex)) {
|
||||
return m_impl.coeff(inputIndex);
|
||||
} else {
|
||||
return Scalar(0);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(yangke): optimize this function so that we can detect and produce
|
||||
// all-zero packets
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
const double compute_cost = NumDims * (3 * TensorOpCost::DivCost<Index>() +
|
||||
3 * TensorOpCost::MulCost<Index>() +
|
||||
2 * TensorOpCost::AddCost<Index>());
|
||||
const double input_size = m_impl.dimensions().TotalSize();
|
||||
const double output_size = m_dimensions.TotalSize();
|
||||
if (output_size == 0)
|
||||
return TensorOpCost();
|
||||
return m_impl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0,
|
||||
compute_cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
protected:
|
||||
Dimensions m_dimensions;
|
||||
array<Index, NumDims> m_outputStrides;
|
||||
array<Index, NumDims> m_inputStrides;
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
const Strides m_strides;
|
||||
array<internal::TensorIntDivisor<Index>, NumDims> m_fastStrides;
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
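// A usage sketch (illustrative, not part of this header), assuming the
// TensorBase::inflate() entry point that builds a TensorInflationOp:
#include <unsupported/Eigen/CXX11/Tensor>

inline void inflation_example() {
  Eigen::Tensor<float, 1> input(3);
  input.setValues({1.f, 2.f, 3.f});
  Eigen::array<Eigen::DenseIndex, 1> strides = {{2}};
  // (3 - 1) * 2 + 1 = 5 coefficients: {1, 0, 2, 0, 3}
  Eigen::Tensor<float, 1> inflated = input.inflate(strides);
}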
@ -0,0 +1,82 @@
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
|
||||
#include <initializer_list>
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorInitializer
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Helper template to initialize Tensors from std::initializer_lists.
|
||||
*/
|
||||
namespace internal {
|
||||
|
||||
template <typename Derived, int N>
|
||||
struct Initializer {
|
||||
typedef std::initializer_list<
|
||||
typename Initializer<Derived, N - 1>::InitList> InitList;
|
||||
|
||||
static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
|
||||
Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
|
||||
const InitList& vals) {
|
||||
int i = 0;
|
||||
for (auto v : vals) {
|
||||
(*indices)[traits<Derived>::NumDimensions - N] = i++;
|
||||
Initializer<Derived, N - 1>::run(tensor, indices, v);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Derived>
|
||||
struct Initializer<Derived, 1> {
|
||||
typedef std::initializer_list<typename traits<Derived>::Scalar> InitList;
|
||||
|
||||
static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
|
||||
Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
|
||||
const InitList& vals) {
|
||||
int i = 0;
|
||||
// There is likely a faster way to do that than iterating.
|
||||
for (auto v : vals) {
|
||||
(*indices)[traits<Derived>::NumDimensions - 1] = i++;
|
||||
tensor.coeffRef(*indices) = v;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Derived>
|
||||
struct Initializer<Derived, 0> {
|
||||
typedef typename traits<Derived>::Scalar InitList;
|
||||
|
||||
static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
|
||||
Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>*,
|
||||
const InitList& v) {
|
||||
tensor.coeffRef(0) = v;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename Derived, int N>
|
||||
void initialize_tensor(TensorEvaluator<Derived, DefaultDevice>& tensor,
|
||||
const typename Initializer<Derived, traits<Derived>::NumDimensions>::InitList& vals) {
|
||||
Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions> indices;
|
||||
Initializer<Derived, traits<Derived>::NumDimensions>::run(tensor, &indices, vals);
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
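// A usage sketch (illustrative, not part of this header): setValues() on a
// tensor funnels nested std::initializer_lists through the Initializer
// recursion above, one dimension per nesting level.
#include <unsupported/Eigen/CXX11/Tensor>

inline void initializer_example() {
  Eigen::Tensor<int, 2> a(2, 3);
  // Outer braces index dimension 0, inner braces index dimension 1.
  a.setValues({{1, 2, 3},
               {4, 5, 6}});
}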
@ -0,0 +1,253 @@
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class TensorIntDiv
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Fast integer division by a constant.
|
||||
*
|
||||
* See the paper from Granlund and Montgomery for explanation.
|
||||
* (at xxxp://dx.doi.org/10.1145/773473.178249)
|
||||
*
|
||||
* \sa Tensor
|
||||
*/
|
||||
|
||||
namespace internal {
|
||||
|
||||
namespace {
|
||||
|
||||
// Note: result is undefined if val == 0
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
return __clz(val);
|
||||
#elif EIGEN_COMP_MSVC
|
||||
unsigned long index;
|
||||
_BitScanReverse(&index, val);
|
||||
return 31 - index;
|
||||
#else
|
||||
EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return __builtin_clz(static_cast<uint32_t>(val));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
return __clzll(val);
|
||||
#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
|
||||
unsigned long index;
|
||||
_BitScanReverse64(&index, val);
|
||||
return 63 - index;
|
||||
#elif EIGEN_COMP_MSVC
|
||||
// MSVC's _BitScanReverse64 is not available for 32-bit builds.
|
||||
unsigned int lo = (unsigned int)(val&0xffffffff);
|
||||
unsigned int hi = (unsigned int)((val>>32)&0xffffffff);
|
||||
int n;
|
||||
if(hi==0)
|
||||
n = 32 + count_leading_zeros<unsigned int>(lo);
|
||||
else
|
||||
n = count_leading_zeros<unsigned int>(hi);
|
||||
return n;
|
||||
#else
|
||||
EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return __builtin_clzll(static_cast<uint64_t>(val));
|
||||
#endif
|
||||
}
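// Example (illustrative): for val = 1 on the 32-bit MSVC path, lo = 1 and
// hi = 0, so the result is 32 + count_leading_zeros<unsigned int>(1) = 32 + 31
// = 63, matching __builtin_clzll(1) on the other paths.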
|
||||
|
||||
template <typename T>
|
||||
struct UnsignedTraits {
|
||||
typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct DividerTraits {
|
||||
typedef typename UnsignedTraits<T>::type type;
|
||||
static const int N = sizeof(T) * 8;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
|
||||
#if defined(__CUDA_ARCH__)
|
||||
return __umulhi(a, b);
|
||||
#else
|
||||
return (static_cast<uint64_t>(a) * b) >> 32;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
|
||||
#if defined(__CUDA_ARCH__)
|
||||
return __umul64hi(a, b);
|
||||
#elif defined(__SIZEOF_INT128__)
|
||||
__uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
|
||||
return static_cast<uint64_t>(v >> 64);
|
||||
#else
|
||||
return (TensorUInt128<static_val<0>, uint64_t>(a) * TensorUInt128<static_val<0>, uint64_t>(b)).upper();
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int N, typename T>
|
||||
struct DividerHelper {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) {
|
||||
EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return static_cast<uint32_t>((static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct DividerHelper<64, T> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
|
||||
#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
|
||||
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
|
||||
#else
|
||||
const uint64_t shift = 1ULL << log_div;
|
||||
TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider)
|
||||
- TensorUInt128<static_val<1>, static_val<0> >(1, 0)
|
||||
+ TensorUInt128<static_val<0>, static_val<1> >(1);
|
||||
return static_cast<uint64_t>(result);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
template <typename T, bool div_gt_one = false>
|
||||
struct TensorIntDivisor {
|
||||
public:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
|
||||
multiplier = 0;
|
||||
shift1 = 0;
|
||||
shift2 = 0;
|
||||
}
|
||||
|
||||
// Must have 0 < divider < 2^31. This is relaxed to
|
||||
// 0 < divider < 2^63 when using 64-bit indices on platforms that support
|
||||
// the __uint128_t type.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) {
|
||||
const int N = DividerTraits<T>::N;
|
||||
eigen_assert(static_cast<typename UnsignedTraits<T>::type>(divider) < NumTraits<UnsignedType>::highest()/2);
|
||||
eigen_assert(divider > 0);
|
||||
|
||||
// fast ln2
|
||||
const int leading_zeros = count_leading_zeros(static_cast<UnsignedType>(divider));
|
||||
int log_div = N - leading_zeros;
|
||||
// if divider is a power of two then log_div is 1 more than it should be.
|
||||
if ((static_cast<typename UnsignedTraits<T>::type>(1) << (log_div-1)) == static_cast<typename UnsignedTraits<T>::type>(divider))
|
||||
log_div--;
|
||||
|
||||
multiplier = DividerHelper<N, T>::computeMultiplier(log_div, divider);
|
||||
shift1 = log_div > 1 ? 1 : log_div;
|
||||
shift2 = log_div > 1 ? log_div-1 : 0;
|
||||
}
|
||||
|
||||
// Must have 0 <= numerator. On platforms that don't support the __uint128_t
|
||||
// type, the numerator should also be less than 2^32-1.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
|
||||
eigen_assert(static_cast<typename UnsignedTraits<T>::type>(numerator) < NumTraits<UnsignedType>::highest()/2);
|
||||
//eigen_assert(numerator >= 0); // this is implicitly asserted by the line above
|
||||
|
||||
UnsignedType t1 = muluh(multiplier, numerator);
|
||||
UnsignedType t = (static_cast<UnsignedType>(numerator) - t1) >> shift1;
|
||||
return (t1 + t) >> shift2;
|
||||
}
|
||||
|
||||
private:
|
||||
typedef typename DividerTraits<T>::type UnsignedType;
|
||||
UnsignedType multiplier;
|
||||
int32_t shift1;
|
||||
int32_t shift2;
|
||||
};
|
||||
|
||||
|
||||
// Optimized version for signed 32-bit integers.
|
||||
// Derived from Hacker's Delight.
|
||||
// Only works for divisors strictly greater than one
|
||||
template <>
|
||||
class TensorIntDivisor<int32_t, true> {
|
||||
public:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
|
||||
magic = 0;
|
||||
shift = 0;
|
||||
}
|
||||
// Must have 2 <= divider
|
||||
EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) {
|
||||
eigen_assert(divider >= 2);
|
||||
calcMagic(divider);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
|
||||
#ifdef __CUDA_ARCH__
|
||||
return (__umulhi(magic, n) >> shift);
|
||||
#else
|
||||
uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
|
||||
return (static_cast<uint32_t>(v >> 32) >> shift);
|
||||
#endif
|
||||
}
|
||||
|
||||
private:
|
||||
// Compute the magic numbers. See Hacker's Delight section 10 for an
// in-depth explanation.
|
||||
EIGEN_DEVICE_FUNC void calcMagic(int32_t d) {
|
||||
const unsigned two31 = 0x80000000; // 2**31.
|
||||
unsigned ad = d;
|
||||
unsigned t = two31 + (ad >> 31);
|
||||
unsigned anc = t - 1 - t%ad; // Absolute value of nc.
|
||||
int p = 31; // Init. p.
|
||||
unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|.
|
||||
unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|).
|
||||
unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|.
|
||||
unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|).
|
||||
unsigned delta = 0;
|
||||
do {
|
||||
p = p + 1;
|
||||
q1 = 2*q1; // Update q1 = 2**p/|nc|.
|
||||
r1 = 2*r1; // Update r1 = rem(2**p, |nc|).
|
||||
if (r1 >= anc) { // (Must be an unsigned
|
||||
q1 = q1 + 1; // comparison here).
|
||||
r1 = r1 - anc;}
|
||||
q2 = 2*q2; // Update q2 = 2**p/|d|.
|
||||
r2 = 2*r2; // Update r2 = rem(2**p, |d|).
|
||||
if (r2 >= ad) { // (Must be an unsigned
|
||||
q2 = q2 + 1; // comparison here).
|
||||
r2 = r2 - ad;}
|
||||
delta = ad - r2;
|
||||
} while (q1 < delta || (q1 == delta && r1 == 0));
|
||||
|
||||
magic = (unsigned)(q2 + 1);
|
||||
shift = p - 32;
|
||||
}
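// Example (illustrative): for d = 7 the loop above terminates with p = 34,
// giving magic = 0x92492493 and shift = 2; divide(100) then computes
// (umulhi(0x92492493, 100) = 57) >> 2 == 14.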
|
||||
|
||||
uint32_t magic;
|
||||
int32_t shift;
|
||||
};
|
||||
|
||||
|
||||
template <typename T, bool div_gt_one>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T, div_gt_one>& divisor) {
|
||||
return divisor.divide(numerator);
|
||||
}
|
||||
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
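// A standalone sketch (illustrative, not part of this header) of the same
// multiply-and-shift scheme TensorIntDivisor uses for 32-bit values:
// multiplier = ((1 << (32 + log_div)) / d) - (1 << 32) + 1, and a division is
// then one high multiply plus two shifts. __builtin_clz assumes GCC/Clang.
#include <cstdint>
#include <cassert>

struct FastDiv32 {
  uint32_t multiplier;
  int shift1, shift2;

  explicit FastDiv32(uint32_t d) {
    assert(d > 0 && d < (1u << 31));
    int log_div = 32 - __builtin_clz(d);
    if ((1u << (log_div - 1)) == d) log_div--;   // d is a power of two
    multiplier = static_cast<uint32_t>(
        ((uint64_t(1) << (32 + log_div)) / d) - (uint64_t(1) << 32) + 1);
    shift1 = log_div > 1 ? 1 : log_div;
    shift2 = log_div > 1 ? log_div - 1 : 0;
  }

  uint32_t divide(uint32_t n) const {
    const uint32_t t1 =
        static_cast<uint32_t>((uint64_t(multiplier) * n) >> 32);
    return (t1 + ((n - t1) >> shift1)) >> shift2;
  }
};
// e.g. FastDiv32(7).divide(100) == 14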
@ -0,0 +1,209 @@
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorLayoutSwap
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Swap the layout from col-major to row-major, or row-major
|
||||
* to col-major, and invert the order of the dimensions.
|
||||
*
|
||||
* Beware: the dimensions are reversed by this operation. If you want to
|
||||
* preserve the ordering of the dimensions, you need to combine this
|
||||
* operation with a shuffle.
|
||||
*
|
||||
* \example:
|
||||
* Tensor<float, 2, ColMajor> input(2, 4);
|
||||
* Tensor<float, 2, RowMajor> output = input.swap_layout();
|
||||
* eigen_assert(output.dimension(0) == 4);
|
||||
* eigen_assert(output.dimension(1) == 2);
|
||||
*
|
||||
* array<int, 2> shuffle(1, 0);
|
||||
* output = input.swap_layout().shuffle(shuffle);
|
||||
* eigen_assert(output.dimension(0) == 2);
|
||||
* eigen_assert(output.dimension(1) == 4);
|
||||
*
|
||||
*/
|
||||
namespace internal {
|
||||
template<typename XprType>
|
||||
struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = traits<XprType>::NumDimensions;
|
||||
static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor;
|
||||
};
|
||||
|
||||
template<typename XprType>
|
||||
struct eval<TensorLayoutSwapOp<XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorLayoutSwapOp<XprType>& type;
|
||||
};
|
||||
|
||||
template<typename XprType>
|
||||
struct nested<TensorLayoutSwapOp<XprType>, 1, typename eval<TensorLayoutSwapOp<XprType> >::type>
|
||||
{
|
||||
typedef TensorLayoutSwapOp<XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename XprType>
|
||||
class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
|
||||
: m_xpr(expr) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorLayoutSwapOp, const TensorLayoutSwapOp> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorLayoutSwapOp, const OtherDerived> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
};
|
||||
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
|
||||
{
|
||||
typedef TensorLayoutSwapOp<ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device)
|
||||
{
|
||||
for(int i = 0; i < NumDims; ++i) {
|
||||
m_dimensions[i] = m_impl.dimensions()[NumDims-1-i];
|
||||
}
|
||||
}
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
|
||||
return m_impl.evalSubExprsIfNeeded(data);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_impl.coeff(index);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
return m_impl.template packet<LoadMode>(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
return m_impl.costPerCoeff(vectorized);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
|
||||
|
||||
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||
|
||||
protected:
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
Dimensions m_dimensions;
|
||||
};
|
||||
|
||||
|
||||
// Eval as lvalue
|
||||
template<typename ArgType, typename Device>
|
||||
struct TensorEvaluator<TensorLayoutSwapOp<ArgType>, Device>
|
||||
: public TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
|
||||
{
|
||||
typedef TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> Base;
|
||||
typedef TensorLayoutSwapOp<ArgType> XprType;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
|
||||
CoordAccess = false // to be implemented
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: Base(op, device)
|
||||
{ }
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
||||
{
|
||||
return this->m_impl.coeffRef(index);
|
||||
}
|
||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x)
|
||||
{
|
||||
this->m_impl.template writePacket<StoreMode>(index, x);
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
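// A usage sketch (illustrative, not part of this header), a compilable version
// of the example given in the class comment above:
#include <unsupported/Eigen/CXX11/Tensor>

inline void layout_swap_example() {
  Eigen::Tensor<float, 2, Eigen::ColMajor> input(2, 4);
  input.setRandom();

  // swap_layout() alone reverses the dimensions: (2, 4) becomes (4, 2).
  Eigen::Tensor<float, 2, Eigen::RowMajor> swapped = input.swap_layout();

  // Combined with a shuffle, the logical dimension order is preserved.
  Eigen::array<int, 2> perm = {{1, 0}};
  Eigen::Tensor<float, 2, Eigen::RowMajor> same_order =
      input.swap_layout().shuffle(perm);
}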
@ -0,0 +1,54 @@
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H
|
||||
|
||||
|
||||
/** use this macro in sfinae selection in templated functions
*
* template<typename T,
*   typename std::enable_if< isBanana<T>::value , int >::type = 0
* >
* void foo(){}
*
* becomes =>
*
* template<typename T,
*   EIGEN_SFINAE_ENABLE_IF( isBanana<T>::value )
* >
* void foo(){}
*/
|
||||
|
||||
// SFINAE requires variadic templates
|
||||
#ifndef __CUDACC__
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
// SFINAE doesn't work for gcc <= 4.7
|
||||
#ifdef EIGEN_COMP_GNUC
|
||||
#if EIGEN_GNUC_AT_LEAST(4,8)
|
||||
#define EIGEN_HAS_SFINAE
|
||||
#endif
|
||||
#else
|
||||
#define EIGEN_HAS_SFINAE
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \
    typename internal::enable_if< ( __condition__ ) , int >::type = 0
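// Illustrative expansion: with the macro above, the example from the comment,
//   template<typename T, EIGEN_SFINAE_ENABLE_IF( isBanana<T>::value )> void foo(){}
// expands to
//   template<typename T, typename internal::enable_if< ( isBanana<T>::value ) , int >::type = 0> void foo(){}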
|
||||
|
||||
|
||||
#if EIGEN_HAS_CONSTEXPR
|
||||
#define EIGEN_CONSTEXPR constexpr
|
||||
#else
|
||||
#define EIGEN_CONSTEXPR
|
||||
#endif
|
||||
|
||||
|
||||
#endif
@ -0,0 +1,323 @@
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_)
|
||||
|
||||
/** \class TensorMap
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief A tensor expression mapping an existing array of data.
|
||||
*
|
||||
*/
|
||||
/// `template <class> class MakePointer_` is added to convert the host pointer to the device pointer.
/// It is needed because our device compiler does not accept a raw `T*`.
/// To reuse the same Evaluator functions, that type has to be converted to our pointer type `T`.
/// This is done through the `MakePointer_` class. By default, the Type in `MakePointer_<T>` is `T*`,
/// so adding the template parameter with this default value converts the type without breaking any
/// existing code.
|
||||
template<typename PlainObjectType, int Options_, template <class> class MakePointer_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> >
|
||||
{
|
||||
public:
|
||||
typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self;
|
||||
typedef typename PlainObjectType::Base Base;
|
||||
typedef typename Eigen::internal::nested<Self>::type Nested;
|
||||
typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
|
||||
typedef typename internal::traits<PlainObjectType>::Index Index;
|
||||
typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename Base::CoeffReturnType CoeffReturnType;
|
||||
|
||||
/* typedef typename internal::conditional<
|
||||
bool(internal::is_lvalue<PlainObjectType>::value),
|
||||
Scalar *,
|
||||
const Scalar *>::type
|
||||
PointerType;*/
|
||||
typedef typename MakePointer_<Scalar>::Type PointerType;
|
||||
typedef PointerType PointerArgType;
|
||||
|
||||
static const int Options = Options_;
|
||||
|
||||
static const Index NumIndices = PlainObjectType::NumIndices;
|
||||
typedef typename PlainObjectType::Dimensions Dimensions;
|
||||
|
||||
enum {
|
||||
IsAligned = ((int(Options_)&Aligned)==Aligned),
|
||||
Layout = PlainObjectType::Layout,
|
||||
CoordAccess = true,
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() {
|
||||
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
|
||||
EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
|
||||
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
|
||||
EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
}
|
||||
#else
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
|
||||
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
|
||||
EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
|
||||
EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
|
||||
EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
|
||||
EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
|
||||
EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions)
|
||||
: m_data(dataPtr), m_dimensions(dimensions)
|
||||
{ }
|
||||
|
||||
template <typename Dimensions>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions)
|
||||
: m_data(dataPtr), m_dimensions(dimensions)
|
||||
{ }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor)
|
||||
: m_data(tensor.data()), m_dimensions(tensor.dimensions())
|
||||
{ }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE PointerType data() { return m_data; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const PointerType data() const { return m_data; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
|
||||
{
|
||||
// eigen_assert(checkIndexRange(indices));
|
||||
if (PlainObjectType::Options&RowMajor) {
|
||||
const Index index = m_dimensions.IndexOfRowMajor(indices);
|
||||
return m_data[index];
|
||||
} else {
|
||||
const Index index = m_dimensions.IndexOfColMajor(indices);
|
||||
return m_data[index];
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()() const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
return m_data[0];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
|
||||
{
|
||||
eigen_internal_assert(index >= 0 && index < size());
|
||||
return m_data[index];
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
if (PlainObjectType::Options&RowMajor) {
|
||||
const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
|
||||
return m_data[index];
|
||||
} else {
|
||||
const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
|
||||
return m_data[index];
|
||||
}
|
||||
}
|
||||
#else
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
|
||||
{
|
||||
if (PlainObjectType::Options&RowMajor) {
|
||||
const Index index = i1 + i0 * m_dimensions[1];
|
||||
return m_data[index];
|
||||
} else {
|
||||
const Index index = i0 + i1 * m_dimensions[0];
|
||||
return m_data[index];
|
||||
}
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
|
||||
{
|
||||
if (PlainObjectType::Options&RowMajor) {
|
||||
const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
|
||||
return m_data[index];
|
||||
} else {
|
||||
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
|
||||
return m_data[index];
|
||||
}
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
|
||||
{
|
||||
if (PlainObjectType::Options&RowMajor) {
|
||||
const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
|
||||
return m_data[index];
|
||||
} else {
|
||||
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
|
||||
return m_data[index];
|
||||
}
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
|
||||
{
|
||||
if (PlainObjectType::Options&RowMajor) {
|
||||
const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
|
||||
return m_data[index];
|
||||
} else {
|
||||
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
|
||||
return m_data[index];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
|
||||
{
|
||||
// eigen_assert(checkIndexRange(indices));
|
||||
if (PlainObjectType::Options&RowMajor) {
|
||||
const Index index = m_dimensions.IndexOfRowMajor(indices);
|
||||
return m_data[index];
|
||||
} else {
|
||||
const Index index = m_dimensions.IndexOfColMajor(indices);
|
||||
return m_data[index];
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()()
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
return m_data[0];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index index)
|
||||
{
|
||||
eigen_internal_assert(index >= 0 && index < size());
|
||||
return m_data[index];
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
|
||||
{
|
||||
static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
|
||||
const std::size_t NumDims = sizeof...(otherIndices) + 2;
|
||||
if (PlainObjectType::Options&RowMajor) {
|
||||
const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
|
||||
return m_data[index];
|
||||
} else {
|
||||
const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
|
||||
return m_data[index];
|
||||
}
|
||||
}
|
||||
#else
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
|
||||
{
|
||||
if (PlainObjectType::Options&RowMajor) {
|
||||
const Index index = i1 + i0 * m_dimensions[1];
|
||||
return m_data[index];
|
||||
} else {
|
||||
const Index index = i0 + i1 * m_dimensions[0];
|
||||
return m_data[index];
|
||||
}
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
|
||||
{
|
||||
if (PlainObjectType::Options&RowMajor) {
|
||||
const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
|
||||
return m_data[index];
|
||||
} else {
|
||||
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
|
||||
return m_data[index];
|
||||
}
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
|
||||
{
|
||||
if (PlainObjectType::Options&RowMajor) {
|
||||
const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
|
||||
return m_data[index];
|
||||
} else {
|
||||
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
|
||||
return m_data[index];
|
||||
}
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
|
||||
{
|
||||
if (PlainObjectType::Options&RowMajor) {
|
||||
const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
|
||||
return m_data[index];
|
||||
} else {
|
||||
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
|
||||
return m_data[index];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other)
|
||||
{
|
||||
typedef TensorAssignOp<Self, const Self> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Self& operator=(const OtherDerived& other)
|
||||
{
|
||||
typedef TensorAssignOp<Self, const OtherDerived> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
private:
|
||||
typename MakePointer_<Scalar>::Type m_data;
|
||||
Dimensions m_dimensions;
|
||||
};
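// Usage sketch (illustrative only, not part of the original header): a TensorMap
// wraps an existing buffer without copying it, and operator() applies the
// row-major or column-major linearization shown above to address coefficients.
//
//   float data[12];
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > m(data, 3, 4);
//   m(1, 2) = 5.0f;   // writes straight into 'data'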
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H
|
|
@@ -0,0 +1,218 @@
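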
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_META_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
template<bool cond> struct Cond {};
|
||||
|
||||
template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
const T1& choose(Cond<true>, const T1& first, const T2&) {
|
||||
return first;
|
||||
}
|
||||
|
||||
template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
const T2& choose(Cond<false>, const T1&, const T2& second) {
|
||||
return second;
|
||||
}
|
||||
|
||||
|
||||
template <typename T, typename X, typename Y>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T divup(const X x, const Y y) {
|
||||
return static_cast<T>((x + y - 1) / y);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T divup(const T x, const T y) {
|
||||
return static_cast<T>((x + y - 1) / y);
|
||||
}
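// Illustrative note (not part of the original header): divup is integer
// division rounded up, e.g. divup(10, 4) == 3 for positive operands; it is
// handy for splitting a range of n items into fixed-size blocks.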
|
||||
|
||||
template <size_t n> struct max_n_1 {
|
||||
static const size_t size = n;
|
||||
};
|
||||
template <> struct max_n_1<0> {
|
||||
static const size_t size = 1;
|
||||
};
|
||||
|
||||
|
||||
// Default packet types
|
||||
template <typename Scalar, typename Device>
|
||||
struct PacketType : internal::packet_traits<Scalar> {
|
||||
typedef typename internal::packet_traits<Scalar>::type type;
|
||||
};
|
||||
|
||||
// For CUDA packet types when using a GpuDevice
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
|
||||
template <>
|
||||
struct PacketType<half, GpuDevice> {
|
||||
typedef half2 type;
|
||||
static const int size = 2;
|
||||
enum {
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 1,
|
||||
HasArg = 0,
|
||||
HasAbs2 = 0,
|
||||
HasMin = 1,
|
||||
HasMax = 1,
|
||||
HasConj = 0,
|
||||
HasSetLinear = 0,
|
||||
HasBlend = 0,
|
||||
|
||||
HasDiv = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasExp = 1,
|
||||
HasLog = 1,
|
||||
HasLog1p = 0,
|
||||
HasLog10 = 0,
|
||||
HasPow = 1,
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_USE_SYCL)
|
||||
template <typename T>
|
||||
struct PacketType<T, SyclDevice> {
|
||||
typedef T type;
|
||||
static const int size = 1;
|
||||
enum {
|
||||
HasAdd = 0,
|
||||
HasSub = 0,
|
||||
HasMul = 0,
|
||||
HasNegate = 0,
|
||||
HasAbs = 0,
|
||||
HasArg = 0,
|
||||
HasAbs2 = 0,
|
||||
HasMin = 0,
|
||||
HasMax = 0,
|
||||
HasConj = 0,
|
||||
HasSetLinear = 0,
|
||||
HasBlend = 0
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
// Tuple mimics std::pair but works on e.g. nvcc.
|
||||
template <typename U, typename V> struct Tuple {
|
||||
public:
|
||||
U first;
|
||||
V second;
|
||||
|
||||
typedef U first_type;
|
||||
typedef V second_type;
|
||||
|
||||
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Tuple() : first(), second() {}
|
||||
|
||||
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Tuple(const U& f, const V& s) : first(f), second(s) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Tuple& operator= (const Tuple& rhs) {
|
||||
if (&rhs == this) return *this;
|
||||
first = rhs.first;
|
||||
second = rhs.second;
|
||||
return *this;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void swap(Tuple& rhs) {
|
||||
using numext::swap;
|
||||
swap(first, rhs.first);
|
||||
swap(second, rhs.second);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename U, typename V>
|
||||
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
bool operator==(const Tuple<U, V>& x, const Tuple<U, V>& y) {
|
||||
return (x.first == y.first && x.second == y.second);
|
||||
}
|
||||
|
||||
template <typename U, typename V>
|
||||
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
bool operator!=(const Tuple<U, V>& x, const Tuple<U, V>& y) {
|
||||
return !(x == y);
|
||||
}
|
||||
|
||||
|
||||
// Can't use std::pair on cuda devices
|
||||
template <typename Idx> struct IndexPair {
|
||||
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}
|
||||
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) {
|
||||
first = val.first;
|
||||
second = val.second;
|
||||
}
|
||||
|
||||
Idx first;
|
||||
Idx second;
|
||||
};
|
||||
|
||||
|
||||
#ifdef EIGEN_HAS_SFINAE
|
||||
namespace internal {
|
||||
|
||||
template<typename IndexType, Index... Is>
|
||||
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) {
|
||||
return { idx[Is]... };
|
||||
}
|
||||
template<typename IndexType>
|
||||
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
|
||||
return array<Index, 0>();
|
||||
}
|
||||
|
||||
/** Make an array (for index/dimensions) out of a custom index */
|
||||
template<typename Index, std::size_t NumIndices, typename IndexType>
|
||||
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
array<Index, NumIndices> customIndices2Array(IndexType& idx) {
|
||||
return customIndices2Array(idx, typename gen_numeric_list<Index, NumIndices>::type{});
|
||||
}
|
||||
|
||||
|
||||
template <typename B, typename D>
|
||||
struct is_base_of
|
||||
{
|
||||
|
||||
typedef char (&yes)[1];
|
||||
typedef char (&no)[2];
|
||||
|
||||
template <typename BB, typename DD>
|
||||
struct Host
|
||||
{
|
||||
operator BB*() const;
|
||||
operator DD*();
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
static yes check(D*, T);
|
||||
static no check(B*, int);
|
||||
|
||||
static const bool value = sizeof(check(Host<B,D>(), int())) == sizeof(yes);
|
||||
};
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H
|
|
@@ -0,0 +1,888 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorReshaping
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor reshaping class.
  *
  */
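// Usage sketch (illustrative only, not part of the original header): reshape()
// on TensorBase returns a TensorReshapingOp expression; the total number of
// coefficients must be preserved, only the shape changes.
//
//   Eigen::Tensor<float, 2> a(2, 6);
//   a.setRandom();
//   Eigen::array<Eigen::Index, 2> new_shape{{3, 4}};
//   Eigen::Tensor<float, 2> b = a.reshape(new_shape);   // same 12 coefficients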
|
||||
namespace internal {
|
||||
template<typename NewDimensions, typename XprType>
|
||||
struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = array_size<NewDimensions>::value;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename NewDimensions, typename XprType>
|
||||
struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorReshapingOp<NewDimensions, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename NewDimensions, typename XprType>
|
||||
struct nested<TensorReshapingOp<NewDimensions, XprType>, 1, typename eval<TensorReshapingOp<NewDimensions, XprType> >::type>
|
||||
{
|
||||
typedef TensorReshapingOp<NewDimensions, XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename NewDimensions, typename XprType>
|
||||
class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
|
||||
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorReshapingOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims)
|
||||
: m_xpr(expr), m_dims(dims) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const NewDimensions& dimensions() const { return m_dims; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorReshapingOp, const TensorReshapingOp> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorReshapingOp, const OtherDerived> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const NewDimensions m_dims;
|
||||
};
|
||||
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename NewDimensions, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
{
|
||||
typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
|
||||
typedef NewDimensions Dimensions;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_dimensions(op.dimensions())
|
||||
{
|
||||
// The total size of the reshaped tensor must be equal to the total size
|
||||
// of the input tensor.
|
||||
eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
|
||||
}
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
|
||||
return m_impl.evalSubExprsIfNeeded(data);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_impl.coeff(index);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
return m_impl.template packet<LoadMode>(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
return m_impl.costPerCoeff(vectorized);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
|
||||
|
||||
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||
|
||||
protected:
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
NewDimensions m_dimensions;
|
||||
};
|
||||
|
||||
|
||||
// Eval as lvalue
|
||||
template<typename NewDimensions, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
: public TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
|
||||
{
|
||||
typedef TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> Base;
|
||||
typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
|
||||
typedef NewDimensions Dimensions;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: Base(op, device)
|
||||
{ }
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
||||
{
|
||||
return this->m_impl.coeffRef(index);
|
||||
}
|
||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x)
|
||||
{
|
||||
this->m_impl.template writePacket<StoreMode>(index, x);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/** \class TensorSlicing
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor slicing class.
  *
  */
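// Usage sketch (illustrative only, not part of the original header): slice()
// extracts a rectangular block given per-dimension start offsets and extents.
//
//   Eigen::Tensor<float, 2> t(4, 5);
//   t.setRandom();
//   Eigen::array<Eigen::Index, 2> offsets{{1, 2}};
//   Eigen::array<Eigen::Index, 2> extents{{2, 3}};
//   Eigen::Tensor<float, 2> s = t.slice(offsets, extents);  // 2x3 block at (1,2)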
|
||||
namespace internal {
|
||||
template<typename StartIndices, typename Sizes, typename XprType>
|
||||
struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = array_size<StartIndices>::value;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename StartIndices, typename Sizes, typename XprType>
|
||||
struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename StartIndices, typename Sizes, typename XprType>
|
||||
struct nested<TensorSlicingOp<StartIndices, Sizes, XprType>, 1, typename eval<TensorSlicingOp<StartIndices, Sizes, XprType> >::type>
|
||||
{
|
||||
typedef TensorSlicingOp<StartIndices, Sizes, XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename StartIndices, typename Sizes, typename XprType>
|
||||
class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> >
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes)
|
||||
: m_xpr(expr), m_indices(indices), m_sizes(sizes) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const StartIndices& startIndices() const { return m_indices; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
const Sizes& sizes() const { return m_sizes; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorSlicingOp, const OtherDerived> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorSlicingOp, const TensorSlicingOp> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const StartIndices m_indices;
|
||||
const Sizes m_sizes;
|
||||
};
|
||||
|
||||
|
||||
// Fixme: figure out the exact threshold
|
||||
namespace {
|
||||
template <typename Index, typename Device> struct MemcpyTriggerForSlicing {
|
||||
EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { }
|
||||
EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; }
|
||||
|
||||
private:
|
||||
Index threshold_;
|
||||
};
|
||||
|
||||
// It is very expensive to start the memcpy kernel on GPU: we therefore only
|
||||
// use it for large copies.
|
||||
#ifdef EIGEN_USE_GPU
|
||||
template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice> {
|
||||
EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { }
|
||||
EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
|
||||
};
|
||||
#endif
|
||||
}
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
|
||||
{
|
||||
typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
|
||||
static const int NumDims = internal::array_size<Sizes>::value;
|
||||
|
||||
enum {
|
||||
// Alignment can't be guaranteed at compile time since it depends on the
|
||||
// slice offsets and sizes.
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
|
||||
{
|
||||
for (std::size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
|
||||
eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
|
||||
}
|
||||
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
const Sizes& output_dims = op.sizes();
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_inputStrides[0] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
|
||||
}
|
||||
|
||||
// Don't initialize m_fastOutputStrides[0] since it won't ever be accessed.
|
||||
m_outputStrides[0] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
|
||||
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
|
||||
}
|
||||
} else {
|
||||
m_inputStrides[NumDims-1] = 1;
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
|
||||
}
|
||||
|
||||
// Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed.
|
||||
m_outputStrides[NumDims-1] = 1;
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
|
||||
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef Sizes Dimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data && m_impl.data()) {
|
||||
Index contiguous_values = 1;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
contiguous_values *= dimensions()[i];
|
||||
if (dimensions()[i] != m_impl.dimensions()[i]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = NumDims-1; i >= 0; --i) {
|
||||
contiguous_values *= dimensions()[i];
|
||||
if (dimensions()[i] != m_impl.dimensions()[i]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Use memcpy if it's going to be faster than using the regular evaluation.
|
||||
const MemcpyTriggerForSlicing<Index, Device> trigger(m_device);
|
||||
if (trigger(contiguous_values)) {
|
||||
Scalar* src = (Scalar*)m_impl.data();
|
||||
for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
|
||||
Index offset = srcCoeff(i);
|
||||
m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_impl.coeff(srcCoeff(index));
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
|
||||
|
||||
Index inputIndices[] = {0, 0};
|
||||
Index indices[] = {index, index + packetSize - 1};
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx0 = indices[0] / m_fastOutputStrides[i];
|
||||
const Index idx1 = indices[1] / m_fastOutputStrides[i];
|
||||
inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
|
||||
inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
|
||||
indices[0] -= idx0 * m_outputStrides[i];
|
||||
indices[1] -= idx1 * m_outputStrides[i];
|
||||
}
|
||||
inputIndices[0] += (indices[0] + m_offsets[0]);
|
||||
inputIndices[1] += (indices[1] + m_offsets[0]);
|
||||
} else {
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const Index idx0 = indices[0] / m_fastOutputStrides[i];
|
||||
const Index idx1 = indices[1] / m_fastOutputStrides[i];
|
||||
inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
|
||||
inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
|
||||
indices[0] -= idx0 * m_outputStrides[i];
|
||||
indices[1] -= idx1 * m_outputStrides[i];
|
||||
}
|
||||
inputIndices[0] += (indices[0] + m_offsets[NumDims-1]);
|
||||
inputIndices[1] += (indices[1] + m_offsets[NumDims-1]);
|
||||
}
|
||||
if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
|
||||
PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
|
||||
return rslt;
|
||||
}
|
||||
else {
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
|
||||
values[0] = m_impl.coeff(inputIndices[0]);
|
||||
values[packetSize-1] = m_impl.coeff(inputIndices[1]);
|
||||
for (int i = 1; i < packetSize-1; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
|
||||
Scalar* result = m_impl.data();
|
||||
if (result) {
|
||||
Index offset = 0;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
if (m_dimensions[i] != m_impl.dimensions()[i]) {
|
||||
offset += m_offsets[i] * m_inputStrides[i];
|
||||
for (int j = i+1; j < NumDims; ++j) {
|
||||
if (m_dimensions[j] > 1) {
|
||||
return NULL;
|
||||
}
|
||||
offset += m_offsets[j] * m_inputStrides[j];
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = NumDims - 1; i >= 0; --i) {
|
||||
if (m_dimensions[i] != m_impl.dimensions()[i]) {
|
||||
offset += m_offsets[i] * m_inputStrides[i];
|
||||
for (int j = i-1; j >= 0; --j) {
|
||||
if (m_dimensions[j] > 1) {
|
||||
return NULL;
|
||||
}
|
||||
offset += m_offsets[j] * m_inputStrides[j];
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return result + offset;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
|
||||
{
|
||||
Index inputIndex = 0;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx = index / m_fastOutputStrides[i];
|
||||
inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
inputIndex += (index + m_offsets[0]);
|
||||
} else {
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const Index idx = index / m_fastOutputStrides[i];
|
||||
inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
inputIndex += (index + m_offsets[NumDims-1]);
|
||||
}
|
||||
return inputIndex;
|
||||
}
|
||||
|
||||
array<Index, NumDims> m_outputStrides;
|
||||
array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
|
||||
array<Index, NumDims> m_inputStrides;
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
const Device& m_device;
|
||||
Dimensions m_dimensions;
|
||||
const StartIndices m_offsets;
|
||||
};
|
||||
|
||||
|
||||
// Eval as lvalue
|
||||
template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
|
||||
: public TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
|
||||
{
|
||||
typedef TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> Base;
|
||||
typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
|
||||
static const int NumDims = internal::array_size<Sizes>::value;
|
||||
|
||||
enum {
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: Base(op, device)
|
||||
{ }
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef Sizes Dimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
||||
{
|
||||
return this->m_impl.coeffRef(this->srcCoeff(index));
|
||||
}
|
||||
|
||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x)
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
Index inputIndices[] = {0, 0};
|
||||
Index indices[] = {index, index + packetSize - 1};
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
|
||||
const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
|
||||
inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i];
|
||||
inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i];
|
||||
indices[0] -= idx0 * this->m_outputStrides[i];
|
||||
indices[1] -= idx1 * this->m_outputStrides[i];
|
||||
}
|
||||
inputIndices[0] += (indices[0] + this->m_offsets[0]);
|
||||
inputIndices[1] += (indices[1] + this->m_offsets[0]);
|
||||
} else {
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
|
||||
const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
|
||||
inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i];
|
||||
inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i];
|
||||
indices[0] -= idx0 * this->m_outputStrides[i];
|
||||
indices[1] -= idx1 * this->m_outputStrides[i];
|
||||
}
|
||||
inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]);
|
||||
inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]);
|
||||
}
|
||||
if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
|
||||
this->m_impl.template writePacket<StoreMode>(inputIndices[0], x);
|
||||
}
|
||||
else {
|
||||
EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
|
||||
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
|
||||
this->m_impl.coeffRef(inputIndices[0]) = values[0];
|
||||
this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
|
||||
for (int i = 1; i < packetSize-1; ++i) {
|
||||
this->coeffRef(index+i) = values[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
namespace internal {
|
||||
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
|
||||
struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = array_size<StartIndices>::value;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
|
||||
struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
|
||||
struct nested<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, 1, typename eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >::type>
|
||||
{
|
||||
typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
|
||||
class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >
|
||||
{
|
||||
public:
|
||||
typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename internal::nested<TensorStridingSlicingOp>::type Nested;
|
||||
typedef typename internal::traits<TensorStridingSlicingOp>::StorageKind StorageKind;
|
||||
typedef typename internal::traits<TensorStridingSlicingOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp(
|
||||
const XprType& expr, const StartIndices& startIndices,
|
||||
const StopIndices& stopIndices, const Strides& strides)
|
||||
: m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices),
|
||||
m_strides(strides) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const StartIndices& startIndices() const { return m_startIndices; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
const StartIndices& stopIndices() const { return m_stopIndices; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
const StartIndices& strides() const { return m_strides; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorStridingSlicingOp, const TensorStridingSlicingOp> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(
|
||||
assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorStridingSlicingOp, const OtherDerived> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(
|
||||
assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const StartIndices m_startIndices;
|
||||
const StopIndices m_stopIndices;
|
||||
const Strides m_strides;
|
||||
};
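// Usage sketch (illustrative only; assumes the stridedSlice() helper exposed on
// TensorBase): the start/stop indices act like a half-open range walked with
// the given stride, so the example below keeps rows 0, 2 and 4.
//
//   Eigen::Tensor<float, 2> t(6, 3);
//   t.setRandom();
//   Eigen::array<Eigen::Index, 2> start{{0, 0}};
//   Eigen::array<Eigen::Index, 2> stop{{5, 3}};
//   Eigen::array<Eigen::Index, 2> strides{{2, 1}};
//   Eigen::Tensor<float, 2> r = t.stridedSlice(start, stop, strides);  // 3x3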
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
|
||||
{
|
||||
typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
|
||||
static const int NumDims = internal::array_size<Strides>::value;
|
||||
|
||||
enum {
|
||||
// Alignment can't be guaranteed at compile time since it depends on the
|
||||
// slice offsets and sizes.
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_device(device), m_strides(op.strides())
|
||||
{
|
||||
// Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
|
||||
DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
|
||||
for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
|
||||
eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
|
||||
if(m_strides[i]>0){
|
||||
startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
|
||||
stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
|
||||
}else{
|
||||
/* implies m_strides[i]<0 by assert */
|
||||
startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
|
||||
stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
|
||||
}
|
||||
m_startIndices[i] = startIndicesClamped[i];
|
||||
}
|
||||
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
|
||||
// check for degenerate intervals and compute output tensor shape
|
||||
bool degenerate = false;
|
||||
for(int i = 0; i < NumDims; i++){
|
||||
Index interval = stopIndicesClamped[i] - startIndicesClamped[i];
|
||||
if(interval == 0 || ((interval<0) != (m_strides[i]<0))){
|
||||
m_dimensions[i] = 0;
|
||||
degenerate = true;
|
||||
}else{
|
||||
m_dimensions[i] = interval / m_strides[i]
|
||||
+ (interval % m_strides[i] != 0 ? 1 : 0);
|
||||
eigen_assert(m_dimensions[i] >= 0);
|
||||
}
|
||||
}
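// Worked example (illustrative): with start=1, stop=8 and stride=3 the clamped
// interval is 7, so the dimension becomes 7/3 + 1 = 3 and the kept source
// indices are 1, 4 and 7; a reversed or empty interval yields a 0-sized dim.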
|
||||
Strides output_dims = m_dimensions;
|
||||
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_inputStrides[0] = m_strides[0];
|
||||
m_offsets[0] = startIndicesClamped[0];
|
||||
Index previousDimProduct = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
previousDimProduct *= input_dims[i-1];
|
||||
m_inputStrides[i] = previousDimProduct * m_strides[i];
|
||||
m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
|
||||
}
|
||||
|
||||
// Don't initialize m_fastOutputStrides[0] since it won't ever be accessed.
|
||||
m_outputStrides[0] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
|
||||
// NOTE: if the tensor is degenerate, pass 1 to prevent the TensorIntDivisor constructor from crashing
|
||||
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
|
||||
}
|
||||
} else {
|
||||
m_inputStrides[NumDims-1] = m_strides[NumDims-1];
|
||||
m_offsets[NumDims-1] = startIndicesClamped[NumDims-1];
|
||||
Index previousDimProduct = 1;
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
previousDimProduct *= input_dims[i+1];
|
||||
m_inputStrides[i] = previousDimProduct * m_strides[i];
|
||||
m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
|
||||
}
|
||||
|
||||
m_outputStrides[NumDims-1] = 1;
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
|
||||
// NOTE: if the tensor is degenerate, pass 1 to prevent the TensorIntDivisor constructor from crashing
|
||||
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
|
||||
}
|
||||
}
|
||||
m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
|
||||
device.lastLevelCacheSize() /
|
||||
sizeof(Scalar));
|
||||
}
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef Strides Dimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_impl.coeff(srcCoeff(index));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
|
||||
{
|
||||
Index inputIndex = 0;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i >= 0; --i) {
|
||||
const Index idx = index / m_fastOutputStrides[i];
|
||||
inputIndex += idx * m_inputStrides[i] + m_offsets[i];
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
const Index idx = index / m_fastOutputStrides[i];
|
||||
inputIndex += idx * m_inputStrides[i] + m_offsets[i];
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
}
|
||||
return inputIndex;
|
||||
}
|
||||
|
||||
static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
|
||||
return numext::maxi(min, numext::mini(max,value));
|
||||
}
|
||||
|
||||
array<Index, NumDims> m_outputStrides;
|
||||
array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
|
||||
array<Index, NumDims> m_inputStrides;
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
const Device& m_device;
|
||||
DSizes<Index, NumDims> m_startIndices; // clamped startIndices
|
||||
DSizes<Index, NumDims> m_dimensions;
|
||||
DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
|
||||
const Strides m_strides;
|
||||
std::size_t m_block_total_size_max;
|
||||
};
|
||||
|
||||
// Eval as lvalue
|
||||
template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
|
||||
: public TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
|
||||
{
|
||||
typedef TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> Base;
|
||||
typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
|
||||
static const int NumDims = internal::array_size<Strides>::value;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: Base(op, device)
|
||||
{ }
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef Strides Dimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
||||
{
|
||||
return this->m_impl.coeffRef(this->srcCoeff(index));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
|
|
@@ -0,0 +1,397 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorPadding
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor padding class.
  * At the moment only padding with a constant value is supported.
  *
  */
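// Usage sketch (illustrative only, not part of the original header): pad()
// takes one (before, after) pair per dimension plus an optional constant value.
//
//   Eigen::Tensor<float, 2> t(2, 3);
//   t.setConstant(1.0f);
//   Eigen::array<std::pair<Eigen::Index, Eigen::Index>, 2> paddings;
//   paddings[0] = std::make_pair(1, 1);   // one extra row before and after
//   paddings[1] = std::make_pair(0, 2);   // two extra columns after
//   Eigen::Tensor<float, 2> p = t.pad(paddings);          // 4x5, zero padded
//   Eigen::Tensor<float, 2> q = t.pad(paddings, -1.0f);   // 4x5, padded with -1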
|
||||
namespace internal {
|
||||
template<typename PaddingDimensions, typename XprType>
|
||||
struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename PaddingDimensions, typename XprType>
|
||||
struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorPaddingOp<PaddingDimensions, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename PaddingDimensions, typename XprType>
|
||||
struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type>
|
||||
{
|
||||
typedef TensorPaddingOp<PaddingDimensions, XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename PaddingDimensions, typename XprType>
|
||||
class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value)
|
||||
: m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const PaddingDimensions& padding() const { return m_padding_dims; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
Scalar padding_value() const { return m_padding_value; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const PaddingDimensions m_padding_dims;
|
||||
const Scalar m_padding_value;
|
||||
};
|
||||
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename PaddingDimensions, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
|
||||
{
|
||||
typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<PaddingDimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = true,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value())
|
||||
{
|
||||
// The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
|
||||
// to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
|
||||
// of 1 element first and then pad.
|
||||
EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
|
||||
// Compute dimensions
|
||||
m_dimensions = m_impl.dimensions();
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
m_dimensions[i] += m_padding[i].first + m_padding[i].second;
|
||||
}
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_inputStrides[0] = 1;
|
||||
m_outputStrides[0] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
|
||||
m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
|
||||
}
|
||||
m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1];
|
||||
} else {
|
||||
m_inputStrides[NumDims - 1] = 1;
|
||||
m_outputStrides[NumDims] = 1;
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
|
||||
m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1];
|
||||
}
|
||||
m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0];
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
eigen_assert(index < dimensions().TotalSize());
|
||||
Index inputIndex = 0;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx = index / m_outputStrides[i];
|
||||
if (isPaddingAtIndexForDim(idx, i)) {
|
||||
return m_paddingValue;
|
||||
}
|
||||
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
if (isPaddingAtIndexForDim(index, 0)) {
|
||||
return m_paddingValue;
|
||||
}
|
||||
inputIndex += (index - m_padding[0].first);
|
||||
} else {
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const Index idx = index / m_outputStrides[i+1];
|
||||
if (isPaddingAtIndexForDim(idx, i)) {
|
||||
return m_paddingValue;
|
||||
}
|
||||
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
|
||||
index -= idx * m_outputStrides[i+1];
|
||||
}
|
||||
if (isPaddingAtIndexForDim(index, NumDims-1)) {
|
||||
return m_paddingValue;
|
||||
}
|
||||
inputIndex += (index - m_padding[NumDims-1].first);
|
||||
}
|
||||
return m_impl.coeff(inputIndex);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
return packetColMajor(index);
|
||||
}
|
||||
return packetRowMajor(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
TensorOpCost cost = m_impl.costPerCoeff(vectorized);
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = 0; i < NumDims; ++i)
|
||||
updateCostPerDimension(cost, i, i == 0);
|
||||
} else {
|
||||
for (int i = NumDims - 1; i >= 0; --i)
|
||||
updateCostPerDimension(cost, i, i == NumDims - 1);
|
||||
}
|
||||
return cost;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
private:
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
|
||||
Index index, int dim_index) const {
|
||||
#if defined(EIGEN_HAS_INDEX_LIST)
|
||||
return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
|
||||
index < m_padding[dim_index].first) ||
|
||||
(!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
|
||||
index >= m_dimensions[dim_index] - m_padding[dim_index].second);
|
||||
#else
|
||||
return (index < m_padding[dim_index].first) ||
|
||||
(index >= m_dimensions[dim_index] - m_padding[dim_index].second);
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(
|
||||
int dim_index) const {
|
||||
#if defined(EIGEN_HAS_INDEX_LIST)
|
||||
return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
|
||||
#else
|
||||
EIGEN_UNUSED_VARIABLE(dim_index);
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(
|
||||
int dim_index) const {
|
||||
#if defined(EIGEN_HAS_INDEX_LIST)
|
||||
return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
|
||||
#else
|
||||
EIGEN_UNUSED_VARIABLE(dim_index);
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
|
||||
const double in = static_cast<double>(m_impl.dimensions()[i]);
|
||||
const double out = in + m_padding[i].first + m_padding[i].second;
|
||||
if (out == 0)
|
||||
return;
|
||||
const double reduction = in / out;
|
||||
cost *= reduction;
|
||||
if (first) {
|
||||
cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
|
||||
reduction * (1 * TensorOpCost::AddCost<Index>()));
|
||||
} else {
|
||||
cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
|
||||
2 * TensorOpCost::MulCost<Index>() +
|
||||
reduction * (2 * TensorOpCost::MulCost<Index>() +
|
||||
1 * TensorOpCost::DivCost<Index>()));
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
const Index initialIndex = index;
|
||||
Index inputIndex = 0;
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index first = index;
|
||||
const Index last = index + PacketSize - 1;
|
||||
const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
|
||||
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
|
||||
const Index lastPaddedRight = m_outputStrides[i+1];
|
||||
|
||||
if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
|
||||
// all the coefficients are in the padding zone.
|
||||
return internal::pset1<PacketReturnType>(m_paddingValue);
|
||||
}
|
||||
else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
|
||||
// all the coefficients are in the padding zone.
|
||||
return internal::pset1<PacketReturnType>(m_paddingValue);
|
||||
}
|
||||
else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
|
||||
// all the coefficients are between the 2 padding zones.
|
||||
const Index idx = index / m_outputStrides[i];
|
||||
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
else {
|
||||
// Every other case
|
||||
return packetWithPossibleZero(initialIndex);
|
||||
}
|
||||
}
|
||||
|
||||
const Index last = index + PacketSize - 1;
|
||||
const Index first = index;
|
||||
const Index lastPaddedLeft = m_padding[0].first;
|
||||
const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
|
||||
const Index lastPaddedRight = m_outputStrides[1];
|
||||
|
||||
if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) {
|
||||
// all the coefficients are in the padding zone.
|
||||
return internal::pset1<PacketReturnType>(m_paddingValue);
|
||||
}
|
||||
else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) {
|
||||
// all the coefficients are in the padding zone.
|
||||
return internal::pset1<PacketReturnType>(m_paddingValue);
|
||||
}
|
||||
else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
|
||||
// all the coefficients are between the 2 padding zones.
|
||||
inputIndex += (index - m_padding[0].first);
|
||||
return m_impl.template packet<Unaligned>(inputIndex);
|
||||
}
|
||||
// Every other case
|
||||
return packetWithPossibleZero(initialIndex);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
const Index initialIndex = index;
|
||||
Index inputIndex = 0;
|
||||
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const Index first = index;
|
||||
const Index last = index + PacketSize - 1;
|
||||
const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
|
||||
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
|
||||
const Index lastPaddedRight = m_outputStrides[i];
|
||||
|
||||
if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
|
||||
// all the coefficients are in the padding zone.
|
||||
return internal::pset1<PacketReturnType>(m_paddingValue);
|
||||
}
|
||||
else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
|
||||
// all the coefficients are in the padding zone.
|
||||
return internal::pset1<PacketReturnType>(m_paddingValue);
|
||||
}
|
||||
else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
|
||||
// all the coefficients are between the 2 padding zones.
|
||||
const Index idx = index / m_outputStrides[i+1];
|
||||
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
|
||||
index -= idx * m_outputStrides[i+1];
|
||||
}
|
||||
else {
|
||||
// Every other case
|
||||
return packetWithPossibleZero(initialIndex);
|
||||
}
|
||||
}
|
||||
|
||||
const Index last = index + PacketSize - 1;
|
||||
const Index first = index;
|
||||
const Index lastPaddedLeft = m_padding[NumDims-1].first;
|
||||
const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
|
||||
const Index lastPaddedRight = m_outputStrides[NumDims-1];
|
||||
|
||||
if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) {
|
||||
// all the coefficients are in the padding zone.
|
||||
return internal::pset1<PacketReturnType>(m_paddingValue);
|
||||
}
|
||||
else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) {
|
||||
// all the coefficients are in the padding zone.
|
||||
return internal::pset1<PacketReturnType>(m_paddingValue);
|
||||
}
|
||||
else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
|
||||
// all the coefficients are between the 2 padding zones.
|
||||
inputIndex += (index - m_padding[NumDims-1].first);
|
||||
return m_impl.template packet<Unaligned>(inputIndex);
|
||||
}
|
||||
// Every other case
|
||||
return packetWithPossibleZero(initialIndex);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
|
||||
{
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
|
||||
Dimensions m_dimensions;
|
||||
array<Index, NumDims+1> m_outputStrides;
|
||||
array<Index, NumDims> m_inputStrides;
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
PaddingDimensions m_padding;
|
||||
|
||||
Scalar m_paddingValue;
|
||||
};
|
||||
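// Usage sketch (assuming the usual TensorBase::pad() entry point):
//
//   Eigen::Tensor<float, 2> t(2, 3);
//   t.setConstant(1.0f);
//   Eigen::array<std::pair<int, int>, 2> paddings;
//   paddings[0] = std::make_pair(1, 1);   // pad dimension 0 by 1 on each side
//   paddings[1] = std::make_pair(0, 2);   // pad dimension 1 by 2 at the end
//   Eigen::Tensor<float, 2> padded = t.pad(paddings);   // 4 x 5, zero-padded by default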
|
||||
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
@@ -0,0 +1,269 @@
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorPatch
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor patch class.
|
||||
*
|
||||
*
|
||||
*/
|
||||
namespace internal {
|
||||
template<typename PatchDim, typename XprType>
|
||||
struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions + 1;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename PatchDim, typename XprType>
|
||||
struct eval<TensorPatchOp<PatchDim, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorPatchOp<PatchDim, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename PatchDim, typename XprType>
|
||||
struct nested<TensorPatchOp<PatchDim, XprType>, 1, typename eval<TensorPatchOp<PatchDim, XprType> >::type>
|
||||
{
|
||||
typedef TensorPatchOp<PatchDim, XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename PatchDim, typename XprType>
|
||||
class TensorPatchOp : public TensorBase<TensorPatchOp<PatchDim, XprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorPatchOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorPatchOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorPatchOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorPatchOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims)
|
||||
: m_xpr(expr), m_patch_dims(patch_dims) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const PatchDim& patch_dims() const { return m_patch_dims; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const PatchDim m_patch_dims;
|
||||
};
|
||||
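// Usage sketch (assuming the usual TensorBase::extract_patches() entry point):
//
//   Eigen::Tensor<float, 2> input(5, 5);
//   input.setRandom();
//   Eigen::array<ptrdiff_t, 2> patch_dims{{3, 3}};
//   // ColMajor: the result is 3 x 3 x 9 (one 3x3 patch per valid offset).
//   Eigen::Tensor<float, 3> patches = input.extract_patches(patch_dims);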
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename PatchDim, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
|
||||
{
|
||||
typedef TensorPatchOp<PatchDim, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device)
|
||||
{
|
||||
Index num_patches = 1;
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
const PatchDim& patch_dims = op.patch_dims();
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = 0; i < NumDims-1; ++i) {
|
||||
m_dimensions[i] = patch_dims[i];
|
||||
num_patches *= (input_dims[i] - patch_dims[i] + 1);
|
||||
}
|
||||
m_dimensions[NumDims-1] = num_patches;
|
||||
|
||||
m_inputStrides[0] = 1;
|
||||
m_patchStrides[0] = 1;
|
||||
for (int i = 1; i < NumDims-1; ++i) {
|
||||
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
|
||||
m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1);
|
||||
}
|
||||
m_outputStrides[0] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < NumDims-1; ++i) {
|
||||
m_dimensions[i+1] = patch_dims[i];
|
||||
num_patches *= (input_dims[i] - patch_dims[i] + 1);
|
||||
}
|
||||
m_dimensions[0] = num_patches;
|
||||
|
||||
m_inputStrides[NumDims-2] = 1;
|
||||
m_patchStrides[NumDims-2] = 1;
|
||||
for (int i = NumDims-3; i >= 0; --i) {
|
||||
m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
|
||||
m_patchStrides[i] = m_patchStrides[i+1] * (input_dims[i+1] - patch_dims[i+1] + 1);
|
||||
}
|
||||
m_outputStrides[NumDims-1] = 1;
|
||||
for (int i = NumDims-2; i >= 0; --i) {
|
||||
m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
|
||||
// Find the location of the first element of the patch.
|
||||
Index patchIndex = index / m_outputStrides[output_stride_index];
|
||||
// Find the offset of the element wrt the location of the first element.
|
||||
Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index];
|
||||
Index inputIndex = 0;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 2; i > 0; --i) {
|
||||
const Index patchIdx = patchIndex / m_patchStrides[i];
|
||||
patchIndex -= patchIdx * m_patchStrides[i];
|
||||
const Index offsetIdx = patchOffset / m_outputStrides[i];
|
||||
patchOffset -= offsetIdx * m_outputStrides[i];
|
||||
inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < NumDims - 2; ++i) {
|
||||
const Index patchIdx = patchIndex / m_patchStrides[i];
|
||||
patchIndex -= patchIdx * m_patchStrides[i];
|
||||
const Index offsetIdx = patchOffset / m_outputStrides[i+1];
|
||||
patchOffset -= offsetIdx * m_outputStrides[i+1];
|
||||
inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
|
||||
}
|
||||
}
|
||||
inputIndex += (patchIndex + patchOffset);
|
||||
return m_impl.coeff(inputIndex);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
|
||||
Index indices[2] = {index, index + PacketSize - 1};
|
||||
Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index],
|
||||
indices[1] / m_outputStrides[output_stride_index]};
|
||||
Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index],
|
||||
indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]};
|
||||
|
||||
Index inputIndices[2] = {0, 0};
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 2; i > 0; --i) {
|
||||
const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
|
||||
patchIndices[1] / m_patchStrides[i]};
|
||||
patchIndices[0] -= patchIdx[0] * m_patchStrides[i];
|
||||
patchIndices[1] -= patchIdx[1] * m_patchStrides[i];
|
||||
|
||||
const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i],
|
||||
patchOffsets[1] / m_outputStrides[i]};
|
||||
patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i];
|
||||
patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i];
|
||||
|
||||
inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i];
|
||||
inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < NumDims - 2; ++i) {
|
||||
const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
|
||||
patchIndices[1] / m_patchStrides[i]};
|
||||
patchIndices[0] -= patchIdx[0] * m_patchStrides[i];
|
||||
patchIndices[1] -= patchIdx[1] * m_patchStrides[i];
|
||||
|
||||
const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i+1],
|
||||
patchOffsets[1] / m_outputStrides[i+1]};
|
||||
patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i+1];
|
||||
patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i+1];
|
||||
|
||||
inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i];
|
||||
inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
|
||||
}
|
||||
}
|
||||
inputIndices[0] += (patchIndices[0] + patchOffsets[0]);
|
||||
inputIndices[1] += (patchIndices[1] + patchOffsets[1]);
|
||||
|
||||
if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
|
||||
PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
|
||||
return rslt;
|
||||
}
|
||||
else {
|
||||
EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
|
||||
values[0] = m_impl.coeff(inputIndices[0]);
|
||||
values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
|
||||
for (int i = 1; i < PacketSize-1; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
const double compute_cost = NumDims * (TensorOpCost::DivCost<Index>() +
|
||||
TensorOpCost::MulCost<Index>() +
|
||||
2 * TensorOpCost::AddCost<Index>());
|
||||
return m_impl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
protected:
|
||||
Dimensions m_dimensions;
|
||||
array<Index, NumDims> m_outputStrides;
|
||||
array<Index, NumDims-1> m_inputStrides;
|
||||
array<Index, NumDims-1> m_patchStrides;
|
||||
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
@@ -0,0 +1,276 @@
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
namespace {
|
||||
|
||||
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
|
||||
#ifdef __CUDA_ARCH__
|
||||
// We don't support 3d kernels since we currently only use 1 and
|
||||
// 2d kernels.
|
||||
assert(threadIdx.z == 0);
|
||||
return clock64() +
|
||||
blockIdx.x * blockDim.x + threadIdx.x +
|
||||
gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
|
||||
|
||||
#elif defined _WIN32
|
||||
// Use the current time as a baseline.
|
||||
SYSTEMTIME st;
|
||||
GetSystemTime(&st);
|
||||
int time = st.wSecond + 1000 * st.wMilliseconds;
|
||||
// Mix in a random number to make sure that we get different seeds if
|
||||
// we try to generate seeds faster than the clock resolution.
|
||||
// We need 2 random values since the generator only generates 16 bits at
|
||||
// a time (xxxps://msdn.microsoft.com/en-us/library/398ax69y.aspx)
|
||||
int rnd1 = ::rand();
|
||||
int rnd2 = ::rand();
|
||||
uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
|
||||
return rnd;
|
||||
|
||||
#elif defined __APPLE__
|
||||
// Same approach as for win32, except that the random number generator
|
||||
// is better (see xxxps://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
|
||||
uint64_t rnd = ::random() ^ mach_absolute_time();
|
||||
return rnd;
|
||||
|
||||
#else
|
||||
// Augment the current time with pseudo random number generation
|
||||
// to ensure that we get different seeds if we try to generate seeds
|
||||
// faster than the clock resolution.
|
||||
timespec ts;
|
||||
clock_gettime(CLOCK_REALTIME, &ts);
|
||||
uint64_t rnd = ::random() ^ ts.tv_nsec;
|
||||
return rnd;
|
||||
#endif
|
||||
}
|
||||
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
|
||||
// TODO: Unify with the implementation in the non-blocking thread pool.
|
||||
uint64_t current = *state;
|
||||
// Update the internal state
|
||||
*state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
|
||||
// Generate the random output (using the PCG-XSH-RS scheme)
|
||||
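// (an xorshift of the state followed by a data-dependent right shift: the top
// 3 bits of the old state select a shift between 22 and 29)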
return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
|
||||
}
|
||||
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
|
||||
seed = seed ? seed : get_random_seed();
|
||||
return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
T RandomToTypeUniform(uint64_t* state) {
|
||||
unsigned rnd = PCG_XSH_RS_generator(state);
|
||||
return static_cast<T>(rnd);
|
||||
}
|
||||
|
||||
|
||||
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
|
||||
Eigen::half result;
|
||||
// Generate 10 random bits for the mantissa
|
||||
unsigned rnd = PCG_XSH_RS_generator(state);
|
||||
result.x = static_cast<uint16_t>(rnd & 0x3ffu);
|
||||
// Set the exponent
|
||||
result.x |= (static_cast<uint16_t>(15) << 10);
|
||||
// Return the final result
|
||||
return result - Eigen::half(1.0f);
|
||||
}
|
||||
|
||||
|
||||
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
float RandomToTypeUniform<float>(uint64_t* state) {
|
||||
typedef union {
|
||||
uint32_t raw;
|
||||
float fp;
|
||||
} internal;
|
||||
internal result;
|
||||
// Generate 23 random bits for the mantissa
|
||||
const unsigned rnd = PCG_XSH_RS_generator(state);
|
||||
result.raw = rnd & 0x7fffffu;
|
||||
// Set the exponent
|
||||
result.raw |= (static_cast<uint32_t>(127) << 23);
|
||||
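// The bits above encode a float in [1, 2); subtracting 1.0f below maps it to [0, 1).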
// Return the final result
|
||||
return result.fp - 1.0f;
|
||||
}
|
||||
|
||||
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
double RandomToTypeUniform<double>(uint64_t* state) {
|
||||
typedef union {
|
||||
uint64_t raw;
|
||||
double dp;
|
||||
} internal;
|
||||
internal result;
|
||||
result.raw = 0;
|
||||
// Generate 52 random bits for the mantissa
|
||||
// First generate the upper 20 bits
|
||||
unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
|
||||
// Then generate the lower 32 bits
|
||||
unsigned rnd2 = PCG_XSH_RS_generator(state);
|
||||
result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
|
||||
// Set the exponent
|
||||
result.raw |= (static_cast<uint64_t>(1023) << 52);
|
||||
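// The bits above encode a double in [1, 2); subtracting 1.0 below maps it to [0, 1).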
// Return the final result
|
||||
return result.dp - 1.0;
|
||||
}
|
||||
|
||||
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
|
||||
return std::complex<float>(RandomToTypeUniform<float>(state),
|
||||
RandomToTypeUniform<float>(state));
|
||||
}
|
||||
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
|
||||
return std::complex<double>(RandomToTypeUniform<double>(state),
|
||||
RandomToTypeUniform<double>(state));
|
||||
}
|
||||
|
||||
template <typename T> class UniformRandomGenerator {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
// Uses the given "seed" if non-zero, otherwise uses a random seed.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
|
||||
uint64_t seed = 0) {
|
||||
m_state = PCG_XSH_RS_state(seed);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
|
||||
const UniformRandomGenerator& other) {
|
||||
m_state = other.m_state;
|
||||
}
|
||||
|
||||
template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
T operator()(Index i) const {
|
||||
uint64_t local_state = m_state + i;
|
||||
T result = RandomToTypeUniform<T>(&local_state);
|
||||
m_state = local_state;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Packet packetOp(Index i) const {
|
||||
const int packetSize = internal::unpacket_traits<Packet>::size;
|
||||
EIGEN_ALIGN_MAX T values[packetSize];
|
||||
uint64_t local_state = m_state + i;
|
||||
for (int j = 0; j < packetSize; ++j) {
|
||||
values[j] = RandomToTypeUniform<T>(&local_state);
|
||||
}
|
||||
m_state = local_state;
|
||||
return internal::pload<Packet>(values);
|
||||
}
|
||||
|
||||
private:
|
||||
mutable uint64_t m_state;
|
||||
};
|
||||
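// Usage sketch (assuming the usual TensorBase::setRandom<Generator>() entry point):
//
//   Eigen::Tensor<float, 2> t(64, 64);
//   t.setRandom<Eigen::internal::UniformRandomGenerator<float> >();   // uniform in [0, 1)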
|
||||
template <typename Scalar>
|
||||
struct functor_traits<UniformRandomGenerator<Scalar> > {
|
||||
enum {
|
||||
// Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
|
||||
Cost = 12 * NumTraits<Scalar>::AddCost *
|
||||
((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
|
||||
PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
|
||||
template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
T RandomToTypeNormal(uint64_t* state) {
|
||||
// Use the ratio-of-uniforms method to generate numbers following a normal
|
||||
// distribution. See for example Numerical Recipes chapter 7.3.9 for the
|
||||
// details.
|
||||
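// The quadratic test on q gives cheap accept/reject bounds; the exact, more
// expensive log-based test only runs for the rare borderline candidates.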
T u, v, q;
|
||||
do {
|
||||
u = RandomToTypeUniform<T>(state);
|
||||
v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
|
||||
const T x = u - T(0.449871);
|
||||
const T y = numext::abs(v) + T(0.386595);
|
||||
q = x*x + y * (T(0.196)*y - T(0.25472)*x);
|
||||
} while (q > T(0.27597) &&
|
||||
(q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
|
||||
|
||||
return v/u;
|
||||
}
|
||||
|
||||
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
|
||||
return std::complex<float>(RandomToTypeNormal<float>(state),
|
||||
RandomToTypeNormal<float>(state));
|
||||
}
|
||||
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
|
||||
return std::complex<double>(RandomToTypeNormal<double>(state),
|
||||
RandomToTypeNormal<double>(state));
|
||||
}
|
||||
|
||||
|
||||
template <typename T> class NormalRandomGenerator {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
// Uses the given "seed" if non-zero, otherwise uses a random seed.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
|
||||
m_state = PCG_XSH_RS_state(seed);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
|
||||
const NormalRandomGenerator& other) {
|
||||
m_state = other.m_state;
|
||||
}
|
||||
|
||||
template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
T operator()(Index i) const {
|
||||
uint64_t local_state = m_state + i;
|
||||
T result = RandomToTypeNormal<T>(&local_state);
|
||||
m_state = local_state;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Packet packetOp(Index i) const {
|
||||
const int packetSize = internal::unpacket_traits<Packet>::size;
|
||||
EIGEN_ALIGN_MAX T values[packetSize];
|
||||
uint64_t local_state = m_state + i;
|
||||
for (int j = 0; j < packetSize; ++j) {
|
||||
values[j] = RandomToTypeNormal<T>(&local_state);
|
||||
}
|
||||
m_state = local_state;
|
||||
return internal::pload<Packet>(values);
|
||||
}
|
||||
|
||||
private:
|
||||
mutable uint64_t m_state;
|
||||
};
|
||||
|
||||
|
||||
template <typename Scalar>
|
||||
struct functor_traits<NormalRandomGenerator<Scalar> > {
|
||||
enum {
|
||||
// On average, we need to generate about 3 random numbers
|
||||
// 15 mul, 8 add, 1.5 logs
|
||||
Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
|
||||
15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost +
|
||||
3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
|
||||
PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
@@ -0,0 +1,781 @@
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorReduction
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor reduction class.
|
||||
*
|
||||
*/
|
||||
|
||||
namespace internal {
|
||||
template<typename Op, typename Dims, typename XprType,template <class> class MakePointer_ >
|
||||
struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
|
||||
: traits<XprType>
|
||||
{
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::Scalar Scalar;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
|
||||
template <class T> struct MakePointer {
|
||||
// Intermediate typedef to workaround MSVC issue.
|
||||
typedef MakePointer_<T> MakePointerT;
|
||||
typedef typename MakePointerT::Type Type;
|
||||
};
|
||||
};
|
||||
|
||||
template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
|
||||
struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
|
||||
};
|
||||
|
||||
template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
|
||||
struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
|
||||
{
|
||||
typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
|
||||
};
|
||||
|
||||
|
||||
template <typename OutputDims> struct DimInitializer {
|
||||
template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
|
||||
static void run(const InputDims& input_dims,
|
||||
const array<bool, internal::array_size<InputDims>::value>& reduced,
|
||||
OutputDims* output_dims, ReducedDims* reduced_dims) {
|
||||
const int NumInputDims = internal::array_size<InputDims>::value;
|
||||
int outputIndex = 0;
|
||||
int reduceIndex = 0;
|
||||
for (int i = 0; i < NumInputDims; ++i) {
|
||||
if (reduced[i]) {
|
||||
(*reduced_dims)[reduceIndex] = input_dims[i];
|
||||
++reduceIndex;
|
||||
} else {
|
||||
(*output_dims)[outputIndex] = input_dims[i];
|
||||
++outputIndex;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct DimInitializer<Sizes<> > {
|
||||
template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
|
||||
static void run(const InputDims& input_dims, const array<bool, Rank>&,
|
||||
Sizes<>*, array<Index, Rank>* reduced_dims) {
|
||||
const int NumInputDims = internal::array_size<InputDims>::value;
|
||||
for (int i = 0; i < NumInputDims; ++i) {
|
||||
(*reduced_dims)[i] = input_dims[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename ReducedDims, int NumTensorDims, int Layout>
|
||||
struct are_inner_most_dims {
|
||||
static const bool value = false;
|
||||
};
|
||||
template <typename ReducedDims, int NumTensorDims, int Layout>
|
||||
struct preserve_inner_most_dims {
|
||||
static const bool value = false;
|
||||
};
|
||||
|
||||
#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template <typename ReducedDims, int NumTensorDims>
|
||||
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
|
||||
static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
|
||||
static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
|
||||
static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
|
||||
static const bool value = tmp1 & tmp2 & tmp3;
|
||||
};
|
||||
template <typename ReducedDims, int NumTensorDims>
|
||||
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
|
||||
static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
|
||||
static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
|
||||
static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
|
||||
static const bool value = tmp1 & tmp2 & tmp3;
|
||||
|
||||
};
|
||||
template <typename ReducedDims, int NumTensorDims>
|
||||
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
|
||||
static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
|
||||
static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
|
||||
static const bool value = tmp1 & tmp2;
|
||||
|
||||
};
|
||||
template <typename ReducedDims, int NumTensorDims>
|
||||
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
|
||||
static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
|
||||
static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
|
||||
static const bool value = tmp1 & tmp2;
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
template <int DimIndex, typename Self, typename Op>
|
||||
struct GenericDimReducer {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
|
||||
EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
|
||||
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
|
||||
GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
|
||||
}
|
||||
}
|
||||
};
|
||||
template <typename Self, typename Op>
|
||||
struct GenericDimReducer<0, Self, Op> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
|
||||
for (int j = 0; j < self.m_reducedDims[0]; ++j) {
|
||||
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
|
||||
reducer.reduce(self.m_impl.coeff(input), accum);
|
||||
}
|
||||
}
|
||||
};
|
||||
template <typename Self, typename Op>
|
||||
struct GenericDimReducer<-1, Self, Op> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
|
||||
reducer.reduce(self.m_impl.coeff(index), accum);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
|
||||
struct InnerMostDimReducer {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
|
||||
typename Self::CoeffReturnType accum = reducer.initialize();
|
||||
for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
|
||||
reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
|
||||
}
|
||||
return reducer.finalize(accum);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Self, typename Op>
|
||||
struct InnerMostDimReducer<Self, Op, true> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
|
||||
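// Vectorized path: reduce whole packets first, then fold the scalar tail into a
// separate accumulator and merge the two with finalizeBoth().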
const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
|
||||
const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
|
||||
typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
|
||||
for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
|
||||
reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
|
||||
}
|
||||
typename Self::CoeffReturnType accum = reducer.initialize();
|
||||
for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
|
||||
reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
|
||||
}
|
||||
return reducer.finalizeBoth(accum, p);
|
||||
}
|
||||
};
|
||||
|
||||
template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
|
||||
struct InnerMostDimPreserver {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
|
||||
eigen_assert(false && "should never be called");
|
||||
}
|
||||
};
|
||||
|
||||
template <int DimIndex, typename Self, typename Op>
|
||||
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
|
||||
EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
|
||||
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
|
||||
InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Self, typename Op>
|
||||
struct InnerMostDimPreserver<0, Self, Op, true> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
|
||||
for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
|
||||
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
|
||||
reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
|
||||
}
|
||||
}
|
||||
};
|
||||
template <typename Self, typename Op>
|
||||
struct InnerMostDimPreserver<-1, Self, Op, true> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
|
||||
eigen_assert(false && "should never be called");
|
||||
}
|
||||
};
|
||||
|
||||
// Default full reducer
|
||||
template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
|
||||
struct FullReducer {
|
||||
static const bool HasOptimizedImplementation = false;
|
||||
|
||||
static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
|
||||
const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
|
||||
*output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
#ifdef EIGEN_USE_THREADS
|
||||
// Multithreaded full reducers
|
||||
template <typename Self, typename Op,
|
||||
bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
|
||||
struct FullReducerShard {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
|
||||
typename Self::Index numValuesToReduce, Op& reducer,
|
||||
typename Self::CoeffReturnType* output) {
|
||||
*output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
|
||||
self, firstIndex, numValuesToReduce, reducer);
|
||||
}
|
||||
};
|
||||
|
||||
// Multithreaded full reducer
|
||||
template <typename Self, typename Op, bool Vectorizable>
|
||||
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful;
|
||||
static const int PacketSize =
|
||||
unpacket_traits<typename Self::PacketReturnType>::size;
|
||||
|
||||
// launch one reducer per thread and accumulate the result.
|
||||
static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
|
||||
typename Self::CoeffReturnType* output) {
|
||||
typedef typename Self::Index Index;
|
||||
const Index num_coeffs = array_prod(self.m_impl.dimensions());
|
||||
if (num_coeffs == 0) {
|
||||
*output = reducer.finalize(reducer.initialize());
|
||||
return;
|
||||
}
|
||||
const TensorOpCost cost =
|
||||
self.m_impl.costPerCoeff(Vectorizable) +
|
||||
TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
|
||||
PacketSize);
|
||||
const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
|
||||
num_coeffs, cost, device.numThreads());
|
||||
if (num_threads == 1) {
|
||||
*output =
|
||||
InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
|
||||
return;
|
||||
}
|
||||
const Index blocksize =
|
||||
std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
|
||||
const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
|
||||
eigen_assert(num_coeffs >= numblocks * blocksize);
|
||||
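// Each shard reduces one contiguous block on a pool thread; the tail that does not
// fill a whole block is reduced on the calling thread while the shards run.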
|
||||
Barrier barrier(internal::convert_index<unsigned int>(numblocks));
|
||||
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
|
||||
for (Index i = 0; i < numblocks; ++i) {
|
||||
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
|
||||
self, i * blocksize, blocksize, reducer,
|
||||
&shards[i]);
|
||||
}
|
||||
typename Self::CoeffReturnType finalShard;
|
||||
if (numblocks * blocksize < num_coeffs) {
|
||||
finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
|
||||
self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
|
||||
reducer);
|
||||
} else {
|
||||
finalShard = reducer.initialize();
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
for (Index i = 0; i < numblocks; ++i) {
|
||||
reducer.reduce(shards[i], &finalShard);
|
||||
}
|
||||
*output = reducer.finalize(finalShard);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// Default inner reducer
|
||||
template <typename Self, typename Op, typename Device>
|
||||
struct InnerReducer {
|
||||
static const bool HasOptimizedImplementation = false;
|
||||
|
||||
EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
|
||||
eigen_assert(false && "Not implemented");
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
// Default outer reducer
|
||||
template <typename Self, typename Op, typename Device>
|
||||
struct OuterReducer {
|
||||
static const bool HasOptimizedImplementation = false;
|
||||
|
||||
EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
|
||||
eigen_assert(false && "Not implemented");
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
template <int B, int N, typename S, typename R, typename I>
|
||||
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
|
||||
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
template <typename S, typename R, typename I>
|
||||
__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
|
||||
template <int B, int N, typename S, typename R, typename I>
|
||||
__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
|
||||
template <int NPT, typename S, typename R, typename I>
|
||||
__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*);
|
||||
|
||||
#endif
|
||||
|
||||
template <int NPT, typename S, typename R, typename I>
|
||||
__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
|
||||
|
||||
template <int NPT, typename S, typename R, typename I>
|
||||
__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
|
||||
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
|
||||
{ }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
|
||||
{ }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const XprType& expression() const { return m_expr; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Dims& dims() const { return m_dims; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Op& reducer() const { return m_reducer; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_expr;
|
||||
const Dims m_dims;
|
||||
const Op m_reducer;
|
||||
};
|
||||
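// Usage sketch (assuming the usual TensorBase reduction helpers such as sum(),
// maximum() and the generic reduce()):
//
//   Eigen::Tensor<float, 3> t(4, 8, 16);
//   t.setRandom();
//   Eigen::array<int, 1> dims{{1}};                   // reduce the middle dimension
//   Eigen::Tensor<float, 2> summed = t.sum(dims);     // 4 x 16
//   Eigen::Tensor<float, 2> maxed = t.maximum(dims);  // per-slice maxima, also 4 x 16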
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
|
||||
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
|
||||
{
|
||||
typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef ArgType ChildType;
|
||||
typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
|
||||
static const int NumInputDims = internal::array_size<InputDimensions>::value;
|
||||
static const int NumReducedDims = internal::array_size<Dims>::value;
|
||||
static const int NumOutputDims = NumInputDims - NumReducedDims;
|
||||
typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
|
||||
static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
|
||||
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
|
||||
static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
|
||||
static const bool RunningFullReduction = (NumOutputDims==0);
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims())
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
|
||||
YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
|
||||
// Build the bitmap indicating if an input dimension is reduced or not.
|
||||
for (int i = 0; i < NumInputDims; ++i) {
|
||||
m_reduced[i] = false;
|
||||
}
|
||||
for (int i = 0; i < NumReducedDims; ++i) {
|
||||
eigen_assert(op.dims()[i] >= 0);
|
||||
eigen_assert(op.dims()[i] < NumInputDims);
|
||||
m_reduced[op.dims()[i]] = true;
|
||||
}
|
||||
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);
|
||||
|
||||
// Precompute output strides.
|
||||
if (NumOutputDims > 0) {
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_outputStrides[0] = 1;
|
||||
for (int i = 1; i < NumOutputDims; ++i) {
|
||||
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
|
||||
}
|
||||
} else {
|
||||
m_outputStrides.back() = 1;
|
||||
for (int i = NumOutputDims - 2; i >= 0; --i) {
|
||||
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Precompute input strides.
|
||||
if (NumInputDims > 0) {
|
||||
array<Index, NumInputDims> input_strides;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
input_strides[0] = 1;
|
||||
for (int i = 1; i < NumInputDims; ++i) {
|
||||
input_strides[i] = input_strides[i-1] * input_dims[i-1];
|
||||
}
|
||||
} else {
|
||||
input_strides.back() = 1;
|
||||
for (int i = NumInputDims - 2; i >= 0; --i) {
|
||||
input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
|
||||
}
|
||||
}
|
||||
|
||||
int outputIndex = 0;
|
||||
int reduceIndex = 0;
|
||||
for (int i = 0; i < NumInputDims; ++i) {
|
||||
if (m_reduced[i]) {
|
||||
m_reducedStrides[reduceIndex] = input_strides[i];
|
||||
++reduceIndex;
|
||||
} else {
|
||||
m_preservedStrides[outputIndex] = input_strides[i];
|
||||
++outputIndex;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Special case for full reductions
|
||||
if (NumOutputDims == 0) {
|
||||
m_preservedStrides[0] = internal::array_prod(input_dims);
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
|
||||
// Use the FullReducer if possible.
|
||||
if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction &&
|
||||
internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
|
||||
((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
|
||||
!RunningOnGPU))) {
|
||||
bool need_assign = false;
|
||||
if (!data) {
|
||||
m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
|
||||
data = m_result;
|
||||
need_assign = true;
|
||||
}
|
||||
Op reducer(m_reducer);
|
||||
internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
|
||||
return need_assign;
|
||||
}
|
||||
else if(RunningOnSycl){
|
||||
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
|
||||
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
|
||||
if (!data) {
|
||||
data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
|
||||
m_result = data;
|
||||
}
|
||||
Op reducer(m_reducer);
|
||||
internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
|
||||
return (m_result != NULL);
|
||||
}
|
||||
|
||||
// Attempt to use an optimized reduction.
|
||||
else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
|
||||
bool reducing_inner_dims = true;
|
||||
for (int i = 0; i < NumReducedDims; ++i) {
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
reducing_inner_dims &= m_reduced[i];
|
||||
} else {
|
||||
reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
|
||||
}
|
||||
}
|
||||
if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
|
||||
(reducing_inner_dims || ReducingInnerMostDims)) {
|
||||
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
|
||||
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
|
||||
if (!data) {
|
||||
if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
|
||||
data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
|
||||
m_result = data;
|
||||
}
|
||||
else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
Op reducer(m_reducer);
|
||||
if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
|
||||
if (m_result) {
|
||||
m_device.deallocate(m_result);
|
||||
m_result = NULL;
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return (m_result != NULL);
|
||||
}
|
||||
}
|
||||
|
||||
bool preserving_inner_dims = true;
|
||||
for (int i = 0; i < NumReducedDims; ++i) {
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
|
||||
} else {
|
||||
preserving_inner_dims &= m_reduced[i];
|
||||
}
|
||||
}
|
||||
if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
|
||||
preserving_inner_dims) {
|
||||
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
|
||||
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
|
||||
if (!data) {
|
||||
if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
|
||||
data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
|
||||
m_result = data;
|
||||
}
|
||||
else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
Op reducer(m_reducer);
|
||||
if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
|
||||
if (m_result) {
|
||||
m_device.deallocate(m_result);
|
||||
m_result = NULL;
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return (m_result != NULL);
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
if (m_result) {
|
||||
m_device.deallocate(m_result);
|
||||
m_result = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) {
|
||||
return *(m_result + index);
|
||||
}
|
||||
Op reducer(m_reducer);
|
||||
if (ReducingInnerMostDims || RunningFullReduction) {
|
||||
const Index num_values_to_reduce =
|
||||
(static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
|
||||
return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
|
||||
num_values_to_reduce, reducer);
|
||||
} else {
|
||||
typename Self::CoeffReturnType accum = reducer.initialize();
|
||||
internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
|
||||
return reducer.finalize(accum);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(bsteiner): provide a more efficient implementation.
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
|
||||
|
||||
if (RunningOnGPU && m_result) {
|
||||
return internal::pload<PacketReturnType>(m_result + index);
|
||||
}
|
||||
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
if (ReducingInnerMostDims) {
|
||||
const Index num_values_to_reduce =
|
||||
(static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
|
||||
const Index firstIndex = firstInput(index);
|
||||
for (Index i = 0; i < PacketSize; ++i) {
|
||||
Op reducer(m_reducer);
|
||||
values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
|
||||
num_values_to_reduce, reducer);
|
||||
}
|
||||
} else if (PreservingInnerMostDims) {
|
||||
const Index firstIndex = firstInput(index);
|
||||
const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
|
||||
// TBD: extend this to the n innermost dimensions that we preserve.
|
||||
if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
|
||||
Op reducer(m_reducer);
|
||||
typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
|
||||
internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
|
||||
return reducer.finalizePacket(accum);
|
||||
} else {
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = coeff(index + i);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = coeff(index + i);
|
||||
}
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
|
||||
// Must be called after evalSubExprsIfNeeded().
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
if (RunningFullReduction && m_result) {
|
||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
|
||||
} else {
|
||||
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
|
||||
const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
|
||||
return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
|
||||
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC typename MakePointer_<Scalar>::Type data() const { return m_result; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||
/// added for sycl in order to construct the buffer from the sycl device
|
||||
const Device& device() const{return m_device;}
|
||||
/// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel
|
||||
const Dims& xprDims() const {return m_xpr_dims;}
|
||||
|
||||
|
||||
private:
|
||||
template <int, typename, typename> friend struct internal::GenericDimReducer;
|
||||
template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
|
||||
template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
|
||||
template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
|
||||
#ifdef EIGEN_USE_THREADS
|
||||
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
|
||||
#endif
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
|
||||
template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
|
||||
template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
|
||||
#endif
|
||||
template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
|
||||
|
||||
template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
|
||||
#endif
|
||||
|
||||
template <typename S, typename O, typename D> friend struct internal::InnerReducer;
|
||||
|
||||
// Returns the Index in the input tensor of the first value that needs to be
|
||||
// used to compute the reduction at output index "index".
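// The output index is peeled off digit by digit using the output strides; each
// digit is then re-multiplied by the stride that the corresponding preserved
// dimension has in the input tensor. The reduced coordinates are implicitly
// zero, which is exactly the first value the reduction has to visit.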
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
|
||||
if (ReducingInnerMostDims) {
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
return index * m_preservedStrides[0];
|
||||
} else {
|
||||
return index * m_preservedStrides[NumPreservedStrides - 1];
|
||||
}
|
||||
}
|
||||
// TBD: optimize the case where we preserve the innermost dimensions.
|
||||
Index startInput = 0;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumOutputDims - 1; i > 0; --i) {
|
||||
// This is index_i in the output tensor.
|
||||
const Index idx = index / m_outputStrides[i];
|
||||
startInput += idx * m_preservedStrides[i];
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
if (PreservingInnerMostDims) {
|
||||
eigen_assert(m_preservedStrides[0] == 1);
|
||||
startInput += index;
|
||||
} else {
|
||||
startInput += index * m_preservedStrides[0];
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < NumOutputDims - 1; ++i) {
|
||||
// This is index_i in the output tensor.
|
||||
const Index idx = index / m_outputStrides[i];
|
||||
startInput += idx * m_preservedStrides[i];
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
if (PreservingInnerMostDims) {
|
||||
eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
|
||||
startInput += index;
|
||||
} else {
|
||||
startInput += index * m_preservedStrides[NumPreservedStrides - 1];
|
||||
}
|
||||
}
|
||||
return startInput;
|
||||
}
|
||||
|
||||
// Bitmap indicating if an input dimension is reduced or not.
|
||||
array<bool, NumInputDims> m_reduced;
|
||||
// Dimensions of the output of the operation.
|
||||
Dimensions m_dimensions;
|
||||
// Precomputed strides for the output tensor.
|
||||
array<Index, NumOutputDims> m_outputStrides;
|
||||
// Subset of strides of the input tensor for the non-reduced dimensions.
|
||||
// Indexed by output dimensions.
|
||||
static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
|
||||
array<Index, NumPreservedStrides> m_preservedStrides;
|
||||
|
||||
// Subset of strides of the input tensor for the reduced dimensions.
|
||||
// Indexed by reduced dimensions.
|
||||
array<Index, NumReducedDims> m_reducedStrides;
|
||||
// Size of the input dimensions that are reduced.
|
||||
// Indexed by reduced dimensions.
|
||||
array<Index, NumReducedDims> m_reducedDims;
|
||||
|
||||
// Evaluator for the input expression.
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
|
||||
// Operation to apply for computing the reduction.
|
||||
Op m_reducer;
|
||||
|
||||
// For full reductions
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
|
||||
static const bool RunningOnSycl = false;
|
||||
#elif defined(EIGEN_USE_SYCL)
|
||||
static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
|
||||
static const bool RunningOnGPU = false;
|
||||
#else
|
||||
static const bool RunningOnGPU = false;
|
||||
static const bool RunningOnSycl = false;
|
||||
#endif
|
||||
typename MakePointer_<CoeffReturnType>::Type m_result;
|
||||
|
||||
const Device& m_device;
|
||||
const Dims& m_xpr_dims;
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
|
|
@@ -0,0 +1,750 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
// Full reducers for GPU, don't vectorize for now
|
||||
|
||||
// Reducer function that enables multiple CUDA threads to safely accumulate at the same
|
||||
// output address. It basically reads the current value of the output variable, and
|
||||
// attempts to update it with the new value. If in the meantime another CUDA thread
|
||||
// has updated the content of the output address, it will try again.
|
||||
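// Roughly, for a 4-byte T the update loop below behaves like the following
// sketch (illustrative only; the 8-byte branch is identical but goes through
// unsigned long long):
//
//   unsigned int old = *reinterpret_cast<unsigned int*>(output);
//   for (;;) {
//     unsigned int desired = old;
//     reducer.reduce(accum, reinterpret_cast<T*>(&desired)); // fold accum into the candidate
//     if (desired == old) return;                            // nothing new to publish
//     unsigned int seen = atomicCAS((unsigned int*)output, old, desired);
//     if (seen == old) return;                               // CAS succeeded
//     old = seen;                                            // another thread won, retry
//   }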
template <typename T, typename R>
|
||||
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
if (sizeof(T) == 4)
|
||||
{
|
||||
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
|
||||
unsigned int newval = oldval;
|
||||
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
|
||||
if (newval == oldval) {
|
||||
return;
|
||||
}
|
||||
unsigned int readback;
|
||||
while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
|
||||
oldval = readback;
|
||||
newval = oldval;
|
||||
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
|
||||
if (newval == oldval) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (sizeof(T) == 8) {
|
||||
unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
|
||||
unsigned long long newval = oldval;
|
||||
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
|
||||
if (newval == oldval) {
|
||||
return;
|
||||
}
|
||||
unsigned long long readback;
|
||||
while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
|
||||
oldval = readback;
|
||||
newval = oldval;
|
||||
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
|
||||
if (newval == oldval) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
assert(0 && "Wordsize not supported");
|
||||
}
|
||||
#else
|
||||
assert(0 && "Shouldn't be called on unsupported device");
|
||||
#endif
|
||||
}
|
||||
|
||||
// We extend atomicExch to support extra data types
|
||||
template <typename Type>
|
||||
__device__ inline Type atomicExchCustom(Type* address, Type val) {
|
||||
return atomicExch(address, val);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ inline double atomicExchCustom(double* address, double val) {
|
||||
unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
|
||||
return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
|
||||
}
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
template <template <typename T> class R>
|
||||
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
|
||||
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
|
||||
unsigned int newval = oldval;
|
||||
reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
|
||||
if (newval == oldval) {
|
||||
return;
|
||||
}
|
||||
unsigned int readback;
|
||||
while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
|
||||
oldval = readback;
|
||||
newval = oldval;
|
||||
reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
|
||||
if (newval == oldval) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template <>
|
||||
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
atomicAdd(output, accum);
|
||||
#else
|
||||
assert(0 && "Shouldn't be called on unsupported device");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template <typename CoeffType, typename Index>
|
||||
__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
|
||||
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const Index num_threads = blockDim.x * gridDim.x;
|
||||
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
|
||||
output[i] = val;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <int BlockSize, int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
|
||||
typename Self::CoeffReturnType* output, unsigned int* semaphore) {
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
// Initialize the output value
|
||||
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
|
||||
if (gridDim.x == 1) {
|
||||
if (first_index == 0) {
|
||||
*output = reducer.initialize();
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (threadIdx.x == 0) {
|
||||
unsigned int block = atomicCAS(semaphore, 0u, 1u);
|
||||
if (block == 0) {
|
||||
// We're the first block to run, initialize the output value
|
||||
atomicExchCustom(output, reducer.initialize());
|
||||
__threadfence();
|
||||
atomicExch(semaphore, 2u);
|
||||
}
|
||||
else {
|
||||
// Wait for the first block to initialize the output value.
|
||||
// Use atomicCAS here to ensure that the reads aren't cached
|
||||
unsigned int val;
|
||||
do {
|
||||
val = atomicCAS(semaphore, 2u, 2u);
|
||||
}
|
||||
while (val < 2u);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
|
||||
|
||||
typename Self::CoeffReturnType accum = reducer.initialize();
|
||||
Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
|
||||
for (Index i = 0; i < max_iter; i+=BlockSize) {
|
||||
const Index index = first_index + i;
|
||||
eigen_assert(index < num_coeffs);
|
||||
typename Self::CoeffReturnType val = input.m_impl.coeff(index);
|
||||
reducer.reduce(val, &accum);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||
reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
|
||||
}
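// The __shfl_down loop above is a warp-level tree reduction: at each step a lane
// folds in the partial sum held by the lane `offset` positions above it, so after
// log2(warpSize) steps lane 0 of every warp holds the warp's total. That is why
// only the first lane of each warp performs the atomic update below.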
|
||||
|
||||
if ((threadIdx.x & (warpSize - 1)) == 0) {
|
||||
atomicReduce(output, accum, reducer);
|
||||
}
|
||||
|
||||
if (gridDim.x > 1 && threadIdx.x == 0) {
|
||||
// Let the last block reset the semaphore
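// (atomicInc resets the counter to 0 once it reaches gridDim.x + 1, so the last
// block to arrive re-arms the semaphore for the next kernel launch.)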
|
||||
atomicInc(semaphore, gridDim.x + 1);
|
||||
}
|
||||
#else
|
||||
assert(0 && "Shouldn't be called on unsupported device");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
template <typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
|
||||
eigen_assert(blockDim.x == 1);
|
||||
eigen_assert(gridDim.x == 1);
|
||||
if (num_coeffs % 2 != 0) {
|
||||
half last = input.m_impl.coeff(num_coeffs-1);
|
||||
*scratch = __halves2half2(last, reducer.initialize());
|
||||
} else {
|
||||
*scratch = reducer.template initializePacket<half2>();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
|
||||
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const Index num_threads = blockDim.x * gridDim.x;
|
||||
const Index num_packets = num_coeffs / 2;
|
||||
for (Index i = thread_id; i < num_packets; i += num_threads) {
|
||||
((half2*)output)[i] = reducer.template initializePacket<half2>();
|
||||
}
|
||||
|
||||
if (thread_id == 0 && num_coeffs % 2 != 0) {
|
||||
output[num_coeffs-1] = reducer.initialize();
|
||||
}
|
||||
}
|
||||
|
||||
template <int BlockSize, int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
|
||||
half* output, half2* scratch) {
|
||||
eigen_assert(NumPerThread % 2 == 0);
|
||||
|
||||
const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
|
||||
|
||||
// Initialize the output value if it wasn't initialized by the ReductionInitKernel
|
||||
if (gridDim.x == 1 && first_index == 0) {
|
||||
if (num_coeffs % 2 != 0) {
|
||||
half last = input.m_impl.coeff(num_coeffs-1);
|
||||
*scratch = __halves2half2(last, reducer.initialize());
|
||||
} else {
|
||||
*scratch = reducer.template initializePacket<half2>();
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
half2 accum = reducer.template initializePacket<half2>();
|
||||
const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
|
||||
for (Index i = 0; i < max_iter; i += BlockSize) {
|
||||
const Index index = first_index + 2*i;
|
||||
eigen_assert(index + 1 < num_coeffs);
|
||||
half2 val = input.m_impl.template packet<Unaligned>(index);
|
||||
reducer.reducePacket(val, &accum);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||
reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
|
||||
}
|
||||
|
||||
if ((threadIdx.x & (warpSize - 1)) == 0) {
|
||||
atomicReduce(scratch, accum, reducer);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (gridDim.x == 1 && first_index == 0) {
|
||||
half tmp = __low2half(*scratch);
|
||||
reducer.reduce(__high2half(*scratch), &tmp);
|
||||
*output = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Op>
|
||||
__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
|
||||
eigen_assert(threadIdx.x == 0);
|
||||
half tmp = __low2half(*scratch);
|
||||
reducer.reduce(__high2half(*scratch), &tmp);
|
||||
*output = tmp;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
|
||||
struct FullReductionLauncher {
|
||||
static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
|
||||
assert(false && "Should only be called on doubles, floats and half floats");
|
||||
}
|
||||
};
|
||||
|
||||
// Specialization for float and double
|
||||
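// Each thread accumulates up to num_per_thread coefficients, so one block of
// block_size threads covers block_size * num_per_thread inputs and the grid needs
// ceil(num_coeffs / (block_size * num_per_thread)) blocks. The semaphore is only
// needed when more than one block folds its partial result into the same output.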
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
|
||||
struct FullReductionLauncher<
|
||||
Self, Op, OutputType, PacketAccess,
|
||||
typename internal::enable_if<
|
||||
internal::is_same<float, OutputType>::value ||
|
||||
internal::is_same<double, OutputType>::value,
|
||||
void>::type> {
|
||||
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
|
||||
typedef typename Self::Index Index;
|
||||
typedef typename Self::CoeffReturnType Scalar;
|
||||
const int block_size = 256;
|
||||
const int num_per_thread = 128;
|
||||
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||
|
||||
unsigned int* semaphore = NULL;
|
||||
if (num_blocks > 1) {
|
||||
semaphore = device.semaphore();
|
||||
}
|
||||
|
||||
LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
|
||||
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
template <typename Self, typename Op>
|
||||
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
|
||||
static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
|
||||
assert(false && "Should not be called since there is no packet accessor");
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Self, typename Op>
|
||||
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
|
||||
static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
|
||||
typedef typename Self::Index Index;
|
||||
|
||||
const int block_size = 256;
|
||||
const int num_per_thread = 128;
|
||||
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||
half2* scratch = static_cast<half2*>(device.scratchpad());
|
||||
|
||||
if (num_blocks > 1) {
|
||||
// We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
|
||||
// won't be a race condition between multiple thread blocks.
|
||||
LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
|
||||
1, 1, 0, device, reducer, self, num_coeffs, scratch);
|
||||
}
|
||||
|
||||
LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
|
||||
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
|
||||
|
||||
if (num_blocks > 1) {
|
||||
LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
|
||||
1, 1, 0, device, reducer, output, scratch);
|
||||
}
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
template <typename Self, typename Op, bool Vectorizable>
|
||||
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
|
||||
// Unfortunately nvidia doesn't support exotic types such as complex well,
|
||||
// so reduce the scope of the optimized version of the code to the simple cases
|
||||
// of doubles, floats and half floats
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful &&
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
|
||||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
|
||||
#else
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful &&
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value);
|
||||
#endif
|
||||
|
||||
template <typename OutputType>
|
||||
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
|
||||
assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
|
||||
const Index num_coeffs = array_prod(self.m_impl.dimensions());
|
||||
// Don't crash when we're called with an input tensor of size 0.
|
||||
if (num_coeffs == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
|
||||
typename Self::CoeffReturnType* output) {
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
typedef typename Self::CoeffReturnType Type;
|
||||
eigen_assert(blockDim.y == 1);
|
||||
eigen_assert(blockDim.z == 1);
|
||||
eigen_assert(gridDim.y == 1);
|
||||
eigen_assert(gridDim.z == 1);
|
||||
|
||||
const int unroll_times = 16;
|
||||
eigen_assert(NumPerThread % unroll_times == 0);
|
||||
|
||||
const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
|
||||
const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
|
||||
|
||||
const Index num_threads = blockDim.x * gridDim.x;
|
||||
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
// Initialize the output values if they weren't initialized by the ReductionInitKernel
|
||||
if (gridDim.x == 1) {
|
||||
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
|
||||
output[i] = reducer.initialize();
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
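// Each logical block index i maps to one (output row, column chunk) pair:
// i / input_col_blocks selects the preserved output coefficient, and
// i % input_col_blocks selects which chunk of blockDim.x * NumPerThread reduced
// columns this block folds into output[row].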
for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
|
||||
const Index row = i / input_col_blocks;
|
||||
|
||||
if (row < num_preserved_coeffs) {
|
||||
const Index col_block = i % input_col_blocks;
|
||||
const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
|
||||
|
||||
Type reduced_val = reducer.initialize();
|
||||
|
||||
for (Index j = 0; j < NumPerThread; j += unroll_times) {
|
||||
const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
|
||||
if (last_col >= num_coeffs_to_reduce) {
|
||||
for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
|
||||
const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
|
||||
reducer.reduce(val, &reduced_val);
|
||||
}
|
||||
break;
|
||||
} else {
|
||||
// Faster version of the loop with no branches after unrolling.
|
||||
#pragma unroll
|
||||
for (int k = 0; k < unroll_times; ++k) {
|
||||
const Index col = col_begin + blockDim.x * (j + k);
|
||||
reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||
reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
|
||||
}
|
||||
|
||||
if ((threadIdx.x & (warpSize - 1)) == 0) {
|
||||
atomicReduce(&(output[row]), reduced_val, reducer);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
assert(0 && "Shouldn't be called on unsupported device");
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
|
||||
template <int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
|
||||
half* output) {
|
||||
eigen_assert(blockDim.y == 1);
|
||||
eigen_assert(blockDim.z == 1);
|
||||
eigen_assert(gridDim.y == 1);
|
||||
eigen_assert(gridDim.z == 1);
|
||||
|
||||
const int unroll_times = 16;
|
||||
eigen_assert(NumPerThread % unroll_times == 0);
|
||||
eigen_assert(unroll_times % 2 == 0);
|
||||
|
||||
const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
|
||||
const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
|
||||
|
||||
const Index num_threads = blockDim.x * gridDim.x;
|
||||
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
// Initialize the output values if they weren't initialized by the ReductionInitKernel
|
||||
if (gridDim.x == 1) {
|
||||
Index i = 2*thread_id;
|
||||
for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
|
||||
half* loc = output + i;
|
||||
*((half2*)loc) = reducer.template initializePacket<half2>();
|
||||
}
|
||||
if (i < num_preserved_coeffs) {
|
||||
output[i] = reducer.initialize();
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
|
||||
const Index row = 2 * (i / input_col_blocks);
|
||||
|
||||
if (row + 1 < num_preserved_coeffs) {
|
||||
const Index col_block = i % input_col_blocks;
|
||||
const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);
|
||||
|
||||
half2 reduced_val1 = reducer.template initializePacket<half2>();
|
||||
half2 reduced_val2 = reducer.template initializePacket<half2>();
|
||||
|
||||
for (Index j = 0; j < NumPerThread; j += unroll_times) {
|
||||
const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
|
||||
if (last_col >= num_coeffs_to_reduce) {
|
||||
Index col = col_begin + blockDim.x * j;
|
||||
for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
|
||||
const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
|
||||
reducer.reducePacket(val1, &reduced_val1);
|
||||
const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
|
||||
reducer.reducePacket(val2, &reduced_val2);
|
||||
}
|
||||
if (col < num_coeffs_to_reduce) {
|
||||
// Peel off the last remaining column when the number of columns is odd.
|
||||
const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
|
||||
const half2 val1 = __halves2half2(last1, reducer.initialize());
|
||||
reducer.reducePacket(val1, &reduced_val1);
|
||||
const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
|
||||
const half2 val2 = __halves2half2(last2, reducer.initialize());
|
||||
reducer.reducePacket(val2, &reduced_val2);
|
||||
}
|
||||
break;
|
||||
} else {
|
||||
// Faster version of the loop with no branches after unrolling.
|
||||
#pragma unroll
|
||||
for (int k = 0; k < unroll_times; ++k) {
|
||||
const Index col = col_begin + blockDim.x * (j + k) * 2;
|
||||
reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
|
||||
reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||
reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
|
||||
reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
|
||||
}
|
||||
|
||||
half val1 = __low2half(reduced_val1);
|
||||
reducer.reduce(__high2half(reduced_val1), &val1);
|
||||
half val2 = __low2half(reduced_val2);
|
||||
reducer.reduce(__high2half(reduced_val2), &val2);
|
||||
half2 val = __halves2half2(val1, val2);
|
||||
|
||||
if ((threadIdx.x & (warpSize - 1)) == 0) {
|
||||
half* loc = output + row;
|
||||
atomicReduce((half2*)loc, val, reducer);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
|
||||
struct InnerReductionLauncher {
|
||||
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
|
||||
assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
// Specialization for float and double
|
||||
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
|
||||
struct InnerReductionLauncher<
|
||||
Self, Op, OutputType, PacketAccess,
|
||||
typename internal::enable_if<
|
||||
internal::is_same<float, OutputType>::value ||
|
||||
internal::is_same<double, OutputType>::value,
|
||||
void>::type> {
|
||||
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
|
||||
typedef typename Self::Index Index;
|
||||
|
||||
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
|
||||
const int block_size = 256;
|
||||
const int num_per_thread = 128;
|
||||
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() *
|
||||
device.maxCudaThreadsPerMultiProcessor() / block_size;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
|
||||
if (num_blocks > 1) {
|
||||
// We initialize the outputs outside the reduction kernel when we can't be sure that there
|
||||
// won't be a race condition between multiple thread blocks.
|
||||
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() *
|
||||
device.maxCudaThreadsPerMultiProcessor() / 1024;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
|
||||
num_blocks, 1024, 0, device, reducer.initialize(),
|
||||
num_preserved_vals, output);
|
||||
}
|
||||
|
||||
LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
|
||||
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
|
||||
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
template <typename Self, typename Op>
|
||||
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
|
||||
static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
|
||||
assert(false && "Should not be called since there is no packet accessor");
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Self, typename Op>
|
||||
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
|
||||
static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
|
||||
typedef typename Self::Index Index;
|
||||
|
||||
if (num_preserved_vals % 2 != 0) {
|
||||
// Not supported yet, revert to the slower code path
|
||||
return true;
|
||||
}
|
||||
|
||||
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
|
||||
const int block_size = /*256*/128;
|
||||
const int num_per_thread = /*128*/64;
|
||||
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() *
|
||||
device.maxCudaThreadsPerMultiProcessor() / block_size;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
|
||||
if (num_blocks > 1) {
|
||||
// We initialize the outputs outside the reduction kernel when we can't be sure that there
|
||||
// won't be a race condition between multiple thread blocks.
|
||||
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() *
|
||||
device.maxCudaThreadsPerMultiProcessor() / 1024;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
|
||||
1, 1, 0, device, reducer, self, num_preserved_vals, output);
|
||||
}
|
||||
|
||||
LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
|
||||
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
|
||||
|
||||
return false;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
template <typename Self, typename Op>
|
||||
struct InnerReducer<Self, Op, GpuDevice> {
|
||||
// Unfortunately nvidia doesn't support exotic types such as complex well,
|
||||
// so reduce the scope of the optimized version of the code to the simple case
|
||||
// of floats and half floats.
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful &&
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
|
||||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
|
||||
#else
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful &&
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value);
|
||||
#endif
|
||||
|
||||
template <typename OutputType>
|
||||
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
|
||||
assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
|
||||
const Index num_coeffs = array_prod(self.m_impl.dimensions());
|
||||
// Don't crash when we're called with an input tensor of size 0.
|
||||
if (num_coeffs == 0) {
|
||||
return true;
|
||||
}
|
||||
// It's faster to use the usual code.
|
||||
if (num_coeffs_to_reduce <= 128) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
|
||||
}
|
||||
};
|
||||
|
||||
template <int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
|
||||
typename Self::CoeffReturnType* output) {
|
||||
const Index num_threads = blockDim.x * gridDim.x;
|
||||
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
// Initialize the output values if they weren't initialized by the ReductionInitKernel
|
||||
if (gridDim.x == 1) {
|
||||
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
|
||||
output[i] = reducer.initialize();
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// Do the reduction.
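// For an outer reduction the preserved dimension is the fastest-varying one in the
// input, so i % num_preserved_coeffs picks the output column while
// (i / num_preserved_coeffs) * NumPerThread picks the first of the NumPerThread rows
// of the reduced dimension that this iteration folds into output[input_col].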
|
||||
const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
|
||||
for (Index i = thread_id; i < max_iter; i += num_threads) {
|
||||
const Index input_col = i % num_preserved_coeffs;
|
||||
const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
|
||||
typename Self::CoeffReturnType reduced_val = reducer.initialize();
|
||||
const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
|
||||
for (Index j = input_row; j < max_row; j++) {
|
||||
typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
|
||||
reducer.reduce(val, &reduced_val);
|
||||
}
|
||||
atomicReduce(&(output[input_col]), reduced_val, reducer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename Self, typename Op>
|
||||
struct OuterReducer<Self, Op, GpuDevice> {
|
||||
// Unfortunately nvidia doesn't support exotic types such as complex well,
|
||||
// so reduce the scope of the optimized version of the code to the simple case
|
||||
// of floats.
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful &&
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value);
|
||||
template <typename Device, typename OutputType>
|
||||
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
|
||||
assert(false && "Should only be called to reduce doubles or floats on a gpu device");
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
|
||||
typedef typename Self::Index Index;
|
||||
|
||||
// It's faster to use the usual code.
|
||||
if (num_coeffs_to_reduce <= 32) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
|
||||
const int block_size = 256;
|
||||
const int num_per_thread = 16;
|
||||
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() *
|
||||
device.maxCudaThreadsPerMultiProcessor() / block_size;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
|
||||
if (num_blocks > 1) {
|
||||
// We initialize the outputs outside the reduction kernel when we can't be sure that there
|
||||
// won't be a race condition between multiple thread blocks.
|
||||
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() *
|
||||
device.maxCudaThreadsPerMultiProcessor() / 1024;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
|
||||
num_blocks, 1024, 0, device, reducer.initialize(),
|
||||
num_preserved_vals, output);
|
||||
}
|
||||
|
||||
LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
|
||||
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
|
||||
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
|
|
@@ -0,0 +1,242 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
/*****************************************************************
|
||||
* TensorSyclPlaceHolderExpr.h
|
||||
*
|
||||
* \brief:
|
||||
* This is the specialisation of the placeholder expression based on the
|
||||
* operation type
|
||||
*
|
||||
*****************************************************************/
|
||||
|
||||
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
|
||||
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
|
||||
template<typename BufferTOut, typename BufferTIn>
|
||||
static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
|
||||
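/* Each pass of the loop below reduces `length` partial results to roughly
 * length / local values: every work-group collapses its slice of the buffer
 * in local memory and writes one value back into bufI, and the host-side
 * loop repeats until a single value remains (which ends up in bufOut). */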
do {
|
||||
auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
|
||||
cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
|
||||
cl::sycl::range<1>{std::min(length, local)}};
|
||||
/* Two accessors are used: one to the buffer that is being reduced,
|
||||
* and a second to local memory, used to store intermediate data. */
|
||||
auto aI =
|
||||
bufI.template get_access<cl::sycl::access::mode::read_write>(h);
|
||||
auto aOut =
|
||||
bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
|
||||
cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
|
||||
cl::sycl::access::target::local>
|
||||
scratch(cl::sycl::range<1>(local), h);
|
||||
|
||||
/* The parallel_for invocation chosen is the variant with an nd_item
|
||||
* parameter, since the code requires barriers for correctness. */
|
||||
h.parallel_for<KernelName>(
|
||||
r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
|
||||
size_t globalid = id.get_global(0);
|
||||
size_t localid = id.get_local(0);
|
||||
/* All threads collectively read from global memory into local.
|
||||
* The barrier ensures all threads' IO is resolved before
|
||||
* execution continues (strictly speaking, all threads within
|
||||
* a single work-group - there is no co-ordination between
|
||||
* work-groups, only work-items). */
|
||||
if (globalid < length) {
|
||||
scratch[localid] = aI[globalid];
|
||||
}
|
||||
id.barrier(cl::sycl::access::fence_space::local_space);
|
||||
|
||||
/* Apply the reduction operation between the current local
|
||||
* id and the one on the other half of the vector. */
|
||||
if (globalid < length) {
|
||||
int min = (length < local) ? length : local;
|
||||
for (size_t offset = min / 2; offset > 0; offset /= 2) {
|
||||
if (localid < offset) {
|
||||
scratch[localid] += scratch[localid + offset];
|
||||
}
|
||||
id.barrier(cl::sycl::access::fence_space::local_space);
|
||||
}
|
||||
/* The final result will be stored in local id 0. */
|
||||
if (localid == 0) {
|
||||
aI[id.get_group(0)] = scratch[localid];
|
||||
if((length<=local) && globalid ==0){
|
||||
aOut[globalid]=scratch[localid];
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
};
|
||||
dev.m_queue.submit(f);
|
||||
dev.m_queue.throw_asynchronous();
|
||||
|
||||
/* At this point, you could queue::wait_and_throw() to ensure that
|
||||
* errors are caught quickly. However, this would likely impact
|
||||
* performance negatively. */
|
||||
length = length / local;
|
||||
|
||||
} while (length > 1);
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
/// For now let's start with a full reducer
|
||||
/// Self is not used here because in expression construction we are going to treat the reduction as a leaf node.
|
||||
/// We want to take the child of the reduction, build an expression from it and apply the full reducer function on it. FullReducer applies the
|
||||
/// reduction operation on the child of the reduction. Once that is done the reduction is an empty shell that can be thrown away and treated as
|
||||
/// a leaf node.
|
||||
template <typename Self, typename Op, bool Vectorizable>
|
||||
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
|
||||
|
||||
typedef typename Self::CoeffReturnType CoeffReturnType;
|
||||
static const bool HasOptimizedImplementation = false;
|
||||
|
||||
static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
|
||||
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
|
||||
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||
auto functors = TensorSycl::internal::extractFunctors(self.impl());
|
||||
int red_factor = 256; /// initial reduction factor. If the size is less than red_factor we only create one thread.
|
||||
size_t inputSize =self.impl().dimensions().TotalSize();
|
||||
size_t rng = inputSize/red_factor; // the initial number of threads is the input size divided by red_factor
|
||||
size_t remaining = inputSize% red_factor;
|
||||
if(rng ==0) {
|
||||
red_factor=1;
|
||||
};
|
||||
size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
|
||||
size_t GRange=std::max((size_t )1, rng);
|
||||
|
||||
// round the global range up to the next power of 2 for the reduction
|
||||
GRange--;
|
||||
GRange |= GRange >> 1;
|
||||
GRange |= GRange >> 2;
|
||||
GRange |= GRange >> 4;
|
||||
GRange |= GRange >> 8;
|
||||
GRange |= GRange >> 16;
|
||||
#if __x86_64__ || __ppc64__ || _WIN64
|
||||
GRange |= GRange >> 32;
|
||||
#endif
|
||||
GRange++;
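// Example: GRange = 5 becomes 4 (0b100) after the decrement, the shifted ORs smear
// the top bit down to give 7 (0b111), and the final increment yields 8, the next
// power of two. Values that are already powers of two are left unchanged.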
|
||||
size_t outTileSize = tileSize;
|
||||
/// If GRange is smaller than the tile size we shrink the tile size (and hence the local scratch) to GRange; in this case a single recursion kernel reduces everything down to one value.
|
||||
if (GRange < outTileSize) outTileSize=GRange;
|
||||
// Get the final output buffer; no separate assignment step is needed here.
|
||||
auto out_buffer =dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
|
||||
/// creating the shared memory for calculating reduction.
|
||||
/// This one is used to collect all the reduced values from shared memory, as we don't have a global barrier on the GPU. Once it is saved we can
|
||||
/// recursively apply the reduction on it in order to reduce the whole buffer.
|
||||
auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
|
||||
typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
|
||||
Dims dims= self.xprDims();
|
||||
Op functor = reducer;
|
||||
dev.m_queue.submit([&](cl::sycl::handler &cgh) {
|
||||
// create a tuple of accessors from Evaluator
|
||||
auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
|
||||
auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
|
||||
|
||||
cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
|
||||
typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
|
||||
auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||
/// reduction cannot be captured automatically through our device conversion recursion. The reason is that the reduction has two behaviours:
|
||||
/// the first behaviour is when it is used as a root to launch the sub-kernel. The second one is when it is treated as a leaf node to pass the
|
||||
/// calculated result to its parent kernel. While the latter is detected automatically through our device expression generator, the former is created here.
|
||||
const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
|
||||
/// This is the evaluator for device_self_expr. It is exactly the same as the self that has been passed to the run function. The difference is
|
||||
/// the device_evaluator is detectable and recognisable on the device.
|
||||
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
|
||||
/// const cast added as a naive solution to solve the qualifier drop error
|
||||
auto globalid=itemID.get_global_linear_id();
|
||||
|
||||
if(globalid<rng)
|
||||
tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast<Op&>(functor));
|
||||
else
|
||||
tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
|
||||
|
||||
if(remaining!=0 && globalid==0 )
|
||||
// this adds the rest of the input buffer when the input size is not divisible by red_factor.
|
||||
tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast<Op&>(functor));
|
||||
});
|
||||
});
|
||||
dev.m_queue.throw_asynchronous();
|
||||
|
||||
/// This is used to recursively reduce the temporary buffer down to a single element.
|
||||
syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
template <typename Self, typename Op>
|
||||
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
|
||||
|
||||
typedef typename Self::CoeffReturnType CoeffReturnType;
|
||||
static const bool HasOptimizedImplementation = false;
|
||||
|
||||
static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
|
||||
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
|
||||
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||
auto functors = TensorSycl::internal::extractFunctors(self.impl());
|
||||
|
||||
size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
|
||||
|
||||
size_t GRange=num_coeffs_to_preserve;
|
||||
if (tileSize>GRange) tileSize=GRange;
|
||||
else if(GRange>tileSize){
|
||||
size_t xMode = GRange % tileSize;
|
||||
if (xMode != 0) GRange += (tileSize - xMode);
|
||||
}
|
||||
// The final output buffer is accessed directly below; there is no need for a separate assign step.
|
||||
/// creating the shared memory for calculating reduction.
|
||||
/// This one is used to collect all the reduced values from shared memory, as we don't have a global barrier on the GPU. Once it is saved we can
|
||||
/// recursively apply the reduction on it in order to reduce the whole buffer.
|
||||
typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
|
||||
Dims dims= self.xprDims();
|
||||
Op functor = reducer;
|
||||
|
||||
dev.m_queue.submit([&](cl::sycl::handler &cgh) {
|
||||
// create a tuple of accessors from Evaluator
|
||||
auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
|
||||
auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output);
|
||||
|
||||
cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
|
||||
typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
|
||||
auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||
/// reduction cannot be captured automatically through our device conversion recursion. The reason is that the reduction has two behaviours:
|
||||
/// the first behaviour is when it is used as a root to launch the sub-kernel. The second one is when it is treated as a leaf node to pass the
|
||||
/// calculated result to its parent kernel. While the latter is detected automatically through our device expression generator, the former is created here.
|
||||
const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
|
||||
/// This is the evaluator for device_self_expr. It is exactly the same as the self that has been passed to the run function. The difference is
|
||||
/// the device_evaluator is detectable and recognisable on the device.
|
||||
typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
|
||||
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
|
||||
/// const cast added as a naive solution to solve the qualifier drop error
|
||||
auto globalid=itemID.get_global_linear_id();
|
||||
if (globalid< static_cast<size_t>(num_coeffs_to_preserve)) {
|
||||
typename DeviceSelf::CoeffReturnType accum = functor.initialize();
|
||||
GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid),const_cast<Op&>(functor), &accum);
|
||||
functor.finalize(accum);
|
||||
output_accessor.get_pointer()[globalid]= accum;
|
||||
}
|
||||
});
|
||||
});
|
||||
dev.m_queue.throw_asynchronous();
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
|
|
@@ -0,0 +1,429 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_REF_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template <typename Dimensions, typename Scalar>
|
||||
class TensorLazyBaseEvaluator {
|
||||
public:
|
||||
TensorLazyBaseEvaluator() : m_refcount(0) { }
|
||||
virtual ~TensorLazyBaseEvaluator() { }
|
||||
|
||||
EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const = 0;
|
||||
EIGEN_DEVICE_FUNC virtual const Scalar* data() const = 0;
|
||||
|
||||
EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const = 0;
|
||||
EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) = 0;
|
||||
|
||||
void incrRefCount() { ++m_refcount; }
|
||||
void decrRefCount() { --m_refcount; }
|
||||
int refCount() const { return m_refcount; }
|
||||
|
||||
private:
|
||||
// No copy, no assignment.
|
||||
TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
|
||||
TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other);
|
||||
|
||||
int m_refcount;
|
||||
};
|
||||
|
||||
|
||||
template <typename Dimensions, typename Expr, typename Device>
|
||||
class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, typename TensorEvaluator<Expr, Device>::Scalar> {
|
||||
public:
|
||||
// typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
|
||||
typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
|
||||
|
||||
TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) {
|
||||
m_dims = m_impl.dimensions();
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
}
|
||||
virtual ~TensorLazyEvaluatorReadOnly() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const {
|
||||
return m_dims;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC virtual const Scalar* data() const {
|
||||
return m_impl.data();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const {
|
||||
return m_impl.coeff(index);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex /*index*/) {
|
||||
eigen_assert(false && "can't reference the coefficient of an rvalue");
|
||||
return m_dummy;
|
||||
};
|
||||
|
||||
protected:
|
||||
TensorEvaluator<Expr, Device> m_impl;
|
||||
Dimensions m_dims;
|
||||
Scalar m_dummy;
|
||||
};
|
||||
|
||||
template <typename Dimensions, typename Expr, typename Device>
|
||||
class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> {
|
||||
public:
|
||||
typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
|
||||
typedef typename Base::Scalar Scalar;
|
||||
|
||||
TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {
|
||||
}
|
||||
virtual ~TensorLazyEvaluatorWritable() {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) {
|
||||
return this->m_impl.coeffRef(index);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Dimensions, typename Expr, typename Device>
|
||||
class TensorLazyEvaluator : public internal::conditional<bool(internal::is_lvalue<Expr>::value),
|
||||
TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
|
||||
TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type {
|
||||
public:
|
||||
typedef typename internal::conditional<bool(internal::is_lvalue<Expr>::value),
|
||||
TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
|
||||
TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type Base;
|
||||
typedef typename Base::Scalar Scalar;
|
||||
|
||||
TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) {
|
||||
}
|
||||
virtual ~TensorLazyEvaluator() {
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
|
||||
/** \class TensorRef
  * \ingroup CXX11_Tensor_Module
  *
  * \brief A reference to a tensor expression.
  * The expression will be evaluated lazily (as much as possible).
  *
  */
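// Typical use (a minimal sketch, assuming the public Tensor expression API):
// binding an expression to a TensorRef does not evaluate it; coefficients are
// computed only when they are accessed through the ref.
//
// \code
// #include <unsupported/Eigen/CXX11/Tensor>
//
// Eigen::Tensor<float, 3> a(4, 4, 4), b(4, 4, 4);
// a.setRandom(); b.setRandom();
// Eigen::TensorRef<Eigen::Tensor<float, 3> > ref = a * b + a.sqrt();
// float v = ref(1, 2, 3);  // only this coefficient is evaluated
// \endcode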
template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef<PlainObjectType> >
|
||||
{
|
||||
public:
|
||||
typedef TensorRef<PlainObjectType> Self;
|
||||
typedef typename PlainObjectType::Base Base;
|
||||
typedef typename Eigen::internal::nested<Self>::type Nested;
|
||||
typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
|
||||
typedef typename internal::traits<PlainObjectType>::Index Index;
|
||||
typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename Base::CoeffReturnType CoeffReturnType;
|
||||
typedef Scalar* PointerType;
|
||||
typedef PointerType PointerArgType;
|
||||
|
||||
static const Index NumIndices = PlainObjectType::NumIndices;
|
||||
typedef typename PlainObjectType::Dimensions Dimensions;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
Layout = PlainObjectType::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
|
||||
}
|
||||
|
||||
template <typename Expression>
|
||||
EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice())) {
|
||||
m_evaluator->incrRefCount();
|
||||
}
|
||||
|
||||
template <typename Expression>
|
||||
EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) {
|
||||
unrefEvaluator();
|
||||
m_evaluator = new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice());
|
||||
m_evaluator->incrRefCount();
|
||||
return *this;
|
||||
}
|
||||
|
||||
~TensorRef() {
|
||||
unrefEvaluator();
|
||||
}
|
||||
|
||||
TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) {
|
||||
eigen_assert(m_evaluator->refCount() > 0);
|
||||
m_evaluator->incrRefCount();
|
||||
}
|
||||
|
||||
TensorRef& operator = (const TensorRef& other) {
|
||||
if (this != &other) {
|
||||
unrefEvaluator();
|
||||
m_evaluator = other.m_evaluator;
|
||||
eigen_assert(m_evaluator->refCount() > 0);
|
||||
m_evaluator->incrRefCount();
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar operator()(Index index) const
|
||||
{
|
||||
return m_evaluator->coeff(index);
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const
|
||||
{
|
||||
const std::size_t num_indices = (sizeof...(otherIndices) + 1);
|
||||
const array<Index, num_indices> indices{{firstIndex, otherIndices...}};
|
||||
return coeff(indices);
|
||||
}
|
||||
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
|
||||
{
|
||||
const std::size_t num_indices = (sizeof...(otherIndices) + 1);
|
||||
const array<Index, num_indices> indices{{firstIndex, otherIndices...}};
|
||||
return coeffRef(indices);
|
||||
}
|
||||
#else
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1) const
|
||||
{
|
||||
array<Index, 2> indices;
|
||||
indices[0] = i0;
|
||||
indices[1] = i1;
|
||||
return coeff(indices);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2) const
|
||||
{
|
||||
array<Index, 3> indices;
|
||||
indices[0] = i0;
|
||||
indices[1] = i1;
|
||||
indices[2] = i2;
|
||||
return coeff(indices);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3) const
|
||||
{
|
||||
array<Index, 4> indices;
|
||||
indices[0] = i0;
|
||||
indices[1] = i1;
|
||||
indices[2] = i2;
|
||||
indices[3] = i3;
|
||||
return coeff(indices);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
|
||||
{
|
||||
array<Index, 5> indices;
|
||||
indices[0] = i0;
|
||||
indices[1] = i1;
|
||||
indices[2] = i2;
|
||||
indices[3] = i3;
|
||||
indices[4] = i4;
|
||||
return coeff(indices);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1)
|
||||
{
|
||||
array<Index, 2> indices;
|
||||
indices[0] = i0;
|
||||
indices[1] = i1;
|
||||
return coeffRef(indices);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2)
|
||||
{
|
||||
array<Index, 3> indices;
|
||||
indices[0] = i0;
|
||||
indices[1] = i1;
|
||||
indices[2] = i2;
|
||||
return coeffRef(indices);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
|
||||
{
|
||||
array<Index, 4> indices;
|
||||
indices[0] = i0;
|
||||
indices[1] = i1;
|
||||
indices[2] = i2;
|
||||
indices[3] = i3;
|
||||
return coeffRef(indices);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4)
|
||||
{
|
||||
array<Index, 5> indices;
|
||||
indices[0] = i0;
|
||||
indices[1] = i1;
|
||||
indices[2] = i2;
|
||||
indices[3] = i3;
|
||||
indices[4] = i4;
|
||||
return coeffRef(indices);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar coeff(const array<Index, NumIndices>& indices) const
|
||||
{
|
||||
const Dimensions& dims = this->dimensions();
|
||||
Index index = 0;
|
||||
if (PlainObjectType::Options & RowMajor) {
|
||||
index += indices[0];
|
||||
for (size_t i = 1; i < NumIndices; ++i) {
|
||||
index = index * dims[i] + indices[i];
|
||||
}
|
||||
} else {
|
||||
index += indices[NumIndices-1];
|
||||
for (int i = NumIndices-2; i >= 0; --i) {
|
||||
index = index * dims[i] + indices[i];
|
||||
}
|
||||
}
|
||||
return m_evaluator->coeff(index);
|
||||
}
|
||||
template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
|
||||
{
|
||||
const Dimensions& dims = this->dimensions();
|
||||
Index index = 0;
|
||||
if (PlainObjectType::Options & RowMajor) {
|
||||
index += indices[0];
|
||||
for (size_t i = 1; i < NumIndices; ++i) {
|
||||
index = index * dims[i] + indices[i];
|
||||
}
|
||||
} else {
|
||||
index += indices[NumIndices-1];
|
||||
for (int i = NumIndices-2; i >= 0; --i) {
|
||||
index = index * dims[i] + indices[i];
|
||||
}
|
||||
}
|
||||
return m_evaluator->coeffRef(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
|
||||
{
|
||||
return m_evaluator->coeff(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
|
||||
{
|
||||
return m_evaluator->coeffRef(index);
|
||||
}
|
||||
|
||||
private:
|
||||
EIGEN_STRONG_INLINE void unrefEvaluator() {
|
||||
if (m_evaluator) {
|
||||
m_evaluator->decrRefCount();
|
||||
if (m_evaluator->refCount() == 0) {
|
||||
delete m_evaluator;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
internal::TensorLazyBaseEvaluator<Dimensions, Scalar>* m_evaluator;
|
||||
};
|
||||
|
||||
|
||||
// evaluator for rvalues
|
||||
template<typename Derived, typename Device>
|
||||
struct TensorEvaluator<const TensorRef<Derived>, Device>
|
||||
{
|
||||
typedef typename Derived::Index Index;
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename Derived::Scalar CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef typename Derived::Dimensions Dimensions;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
Layout = TensorRef<Derived>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
|
||||
: m_ref(m)
|
||||
{ }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
|
||||
return true;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
|
||||
return m_ref.coeff(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
|
||||
return m_ref.coeffRef(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return m_ref.data(); }
|
||||
|
||||
protected:
|
||||
TensorRef<Derived> m_ref;
|
||||
};
|
||||
|
||||
|
||||
// evaluator for lvalues
|
||||
template<typename Derived, typename Device>
|
||||
struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device>
|
||||
{
|
||||
typedef typename Derived::Index Index;
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename Derived::Scalar CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef typename Derived::Dimensions Dimensions;
|
||||
|
||||
typedef TensorEvaluator<const TensorRef<Derived>, Device> Base;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
|
||||
{ }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
|
||||
return this->m_ref.coeffRef(index);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H
@@ -0,0 +1,288 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
//                    Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at the mozilla.org home page

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
namespace Eigen {
|
||||
|
||||
/** \class TensorReverse
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor reverse elements class.
  *
  */
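// Typical use (a minimal sketch, assuming TensorBase::reverse()): flip a tensor
// along the dimensions flagged true.
//
// \code
// #include <unsupported/Eigen/CXX11/Tensor>
//
// Eigen::Tensor<int, 2> m(2, 3);
// m.setValues({{1, 2, 3}, {4, 5, 6}});
// Eigen::array<bool, 2> flip{{true, false}};  // reverse dim 0 only
// Eigen::Tensor<int, 2> r = m.reverse(flip);  // {{4, 5, 6}, {1, 2, 3}}
// \endcode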
namespace internal {
|
||||
template<typename ReverseDimensions, typename XprType>
|
||||
struct traits<TensorReverseOp<ReverseDimensions,
|
||||
XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename ReverseDimensions, typename XprType>
|
||||
struct eval<TensorReverseOp<ReverseDimensions, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorReverseOp<ReverseDimensions, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename ReverseDimensions, typename XprType>
|
||||
struct nested<TensorReverseOp<ReverseDimensions, XprType>, 1,
|
||||
typename eval<TensorReverseOp<ReverseDimensions, XprType> >::type>
|
||||
{
|
||||
typedef TensorReverseOp<ReverseDimensions, XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
template<typename ReverseDimensions, typename XprType>
|
||||
class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
|
||||
XprType>, WriteAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
|
||||
StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
|
||||
const XprType& expr, const ReverseDimensions& reverse_dims)
|
||||
: m_xpr(expr), m_reverse_dims(reverse_dims) { }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const ReverseDimensions& reverse() const { return m_reverse_dims; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorReverseOp& operator = (const TensorReverseOp& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorReverseOp, const TensorReverseOp> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorReverseOp& operator = (const OtherDerived& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorReverseOp, const OtherDerived> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const ReverseDimensions m_reverse_dims;
|
||||
};
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename ReverseDimensions, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device>
|
||||
{
|
||||
typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<ReverseDimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
|
||||
const Device& device)
|
||||
: m_impl(op.expression(), device), m_reverse(op.reverse())
|
||||
{
|
||||
// Reversing a scalar isn't supported yet. It would be a no-op anyway.
|
||||
EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
|
||||
// Compute strides
|
||||
m_dimensions = m_impl.dimensions();
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_strides[0] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
m_strides[i] = m_strides[i-1] * m_dimensions[i-1];
|
||||
}
|
||||
} else {
|
||||
m_strides[NumDims-1] = 1;
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex(
|
||||
Index index) const {
|
||||
eigen_assert(index < dimensions().TotalSize());
|
||||
Index inputIndex = 0;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
Index idx = index / m_strides[i];
|
||||
index -= idx * m_strides[i];
|
||||
if (m_reverse[i]) {
|
||||
idx = m_dimensions[i] - idx - 1;
|
||||
}
|
||||
inputIndex += idx * m_strides[i] ;
|
||||
}
|
||||
if (m_reverse[0]) {
|
||||
inputIndex += (m_dimensions[0] - index - 1);
|
||||
} else {
|
||||
inputIndex += index;
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
Index idx = index / m_strides[i];
|
||||
index -= idx * m_strides[i];
|
||||
if (m_reverse[i]) {
|
||||
idx = m_dimensions[i] - idx - 1;
|
||||
}
|
||||
inputIndex += idx * m_strides[i] ;
|
||||
}
|
||||
if (m_reverse[NumDims-1]) {
|
||||
inputIndex += (m_dimensions[NumDims-1] - index - 1);
|
||||
} else {
|
||||
inputIndex += index;
|
||||
}
|
||||
}
|
||||
return inputIndex;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(
|
||||
Index index) const {
|
||||
return m_impl.coeff(reverseIndex(index));
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
PacketReturnType packet(Index index) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
// TODO(ndjaitly): write a better packing routine that uses
|
||||
// local structure.
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
|
||||
values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
|
||||
2 * TensorOpCost::MulCost<Index>() +
|
||||
TensorOpCost::DivCost<Index>());
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
if (m_reverse[i]) {
|
||||
compute_cost += 2 * TensorOpCost::AddCost<Index>();
|
||||
}
|
||||
}
|
||||
return m_impl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
protected:
|
||||
Dimensions m_dimensions;
|
||||
array<Index, NumDims> m_strides;
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
ReverseDimensions m_reverse;
|
||||
};
|
||||
|
||||
// Eval as lvalue
|
||||
|
||||
template <typename ReverseDimensions, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
|
||||
: public TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>,
|
||||
Device> {
|
||||
typedef TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>,
|
||||
Device> Base;
|
||||
typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<ReverseDimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
|
||||
const Device& device)
|
||||
: Base(op, device) {}
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Dimensions& dimensions() const { return this->m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
|
||||
return this->m_impl.coeffRef(this->reverseIndex(index));
|
||||
}
|
||||
|
||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x) {
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
// This code is pilfered from TensorMorphing.h
|
||||
EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
|
||||
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
this->coeffRef(index+i) = values[i];
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
@@ -0,0 +1,287 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at the mozilla.org home page

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

namespace Eigen {

namespace internal {
|
||||
|
||||
template <typename Op, typename XprType>
|
||||
struct traits<TensorScanOp<Op, XprType> >
|
||||
: public traits<XprType> {
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename Op, typename XprType>
|
||||
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorScanOp<Op, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename Op, typename XprType>
|
||||
struct nested<TensorScanOp<Op, XprType>, 1,
|
||||
typename eval<TensorScanOp<Op, XprType> >::type>
|
||||
{
|
||||
typedef TensorScanOp<Op, XprType> type;
|
||||
};
|
||||
} // end namespace internal
|
||||
|
||||
/** \class TensorScan
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor scan class.
  */
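// Typical use (a minimal sketch, assuming the cumsum()/cumprod() entry points on
// TensorBase, which build a TensorScanOp along the given axis):
//
// \code
// #include <unsupported/Eigen/CXX11/Tensor>
//
// Eigen::Tensor<float, 2> t(2, 3);
// t.setValues({{1, 2, 3}, {4, 5, 6}});
// Eigen::Tensor<float, 2> sums  = t.cumsum(1);   // running sums along dim 1
// Eigen::Tensor<float, 2> prods = t.cumprod(0);  // running products along dim 0
// \endcode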
template <typename Op, typename XprType>
|
||||
class TensorScanOp
|
||||
: public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
|
||||
const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
|
||||
: m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Index axis() const { return m_axis; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const XprType& expression() const { return m_expr; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Op accumulator() const { return m_accumulator; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
bool exclusive() const { return m_exclusive; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_expr;
|
||||
const Index m_axis;
|
||||
const Op m_accumulator;
|
||||
const bool m_exclusive;
|
||||
};
|
||||
|
||||
template <typename Self, typename Reducer, typename Device>
|
||||
struct ScanLauncher;
|
||||
|
||||
// Eval as rvalue
|
||||
template <typename Op, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
|
||||
|
||||
typedef TensorScanOp<Op, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
|
||||
const Device& device)
|
||||
: m_impl(op.expression(), device),
|
||||
m_device(device),
|
||||
m_exclusive(op.exclusive()),
|
||||
m_accumulator(op.accumulator()),
|
||||
m_size(m_impl.dimensions()[op.axis()]),
|
||||
m_stride(1),
|
||||
m_output(NULL) {
|
||||
|
||||
// Accumulating a scalar isn't supported.
|
||||
EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
|
||||
|
||||
// Compute stride of scan axis
|
||||
const Dimensions& dims = m_impl.dimensions();
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = 0; i < op.axis(); ++i) {
|
||||
m_stride = m_stride * dims[i];
|
||||
}
|
||||
} else {
|
||||
for (int i = NumDims - 1; i > op.axis(); --i) {
|
||||
m_stride = m_stride * dims[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
|
||||
return m_impl.dimensions();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
|
||||
return m_stride;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
|
||||
return m_size;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
|
||||
return m_accumulator;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
|
||||
return m_exclusive;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
|
||||
return m_impl;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
|
||||
return m_device;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
ScanLauncher<Self, Op, Device> launcher;
|
||||
if (data) {
|
||||
launcher(*this, data);
|
||||
return false;
|
||||
}
|
||||
|
||||
const Index total_size = internal::array_prod(dimensions());
|
||||
m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
|
||||
launcher(*this, m_output);
|
||||
return true;
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
|
||||
{
|
||||
return m_output;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_output[index];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
|
||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
if (m_output != NULL) {
|
||||
m_device.deallocate(m_output);
|
||||
m_output = NULL;
|
||||
}
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
protected:
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
const Device& m_device;
|
||||
const bool m_exclusive;
|
||||
Op m_accumulator;
|
||||
const Index m_size;
|
||||
Index m_stride;
|
||||
CoeffReturnType* m_output;
|
||||
};
|
||||
|
||||
// CPU implementation of scan
|
||||
// TODO(ibab) This single-threaded implementation should be parallelized,
|
||||
// at least by running multiple scans at the same time.
|
||||
template <typename Self, typename Reducer, typename Device>
|
||||
struct ScanLauncher {
|
||||
void operator()(Self& self, typename Self::CoeffReturnType *data) {
|
||||
Index total_size = internal::array_prod(self.dimensions());
|
||||
|
||||
// We fix the index along the scan axis to 0 and perform a
// scan per remaining entry. The iteration is split into two nested
// loops to avoid an integer division, by tracking idx1 and idx2 separately.
|
||||
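// For intuition (example values, not checked by this code): a ColMajor tensor of
// shape (2, 3, 4) scanned along axis 1 has stride() == 2 and size() == 3, so idx1
// walks {0, 6, 12, 18} and idx2 walks {0, 1}, giving 8 independent scans; the scan
// starting at offset 0 visits entries 0, 2 and 4.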
for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
|
||||
for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
|
||||
// Calculate the starting offset for the scan
|
||||
Index offset = idx1 + idx2;
|
||||
|
||||
// Compute the scan along the axis, starting at the calculated offset
|
||||
typename Self::CoeffReturnType accum = self.accumulator().initialize();
|
||||
for (Index idx3 = 0; idx3 < self.size(); idx3++) {
|
||||
Index curr = offset + idx3 * self.stride();
|
||||
|
||||
if (self.exclusive()) {
|
||||
data[curr] = self.accumulator().finalize(accum);
|
||||
self.accumulator().reduce(self.inner().coeff(curr), &accum);
|
||||
} else {
|
||||
self.accumulator().reduce(self.inner().coeff(curr), &accum);
|
||||
data[curr] = self.accumulator().finalize(accum);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
|
||||
// GPU implementation of scan
|
||||
// TODO(ibab) This placeholder implementation performs multiple scans in
|
||||
// parallel, but it would be better to use a parallel scan algorithm and
|
||||
// optimize memory access.
|
||||
template <typename Self, typename Reducer>
|
||||
__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
|
||||
// Compute offset as in the CPU version
|
||||
Index val = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
|
||||
|
||||
if (offset + (self.size() - 1) * self.stride() < total_size) {
|
||||
// Compute the scan along the axis, starting at the calculated offset
|
||||
typename Self::CoeffReturnType accum = self.accumulator().initialize();
|
||||
for (Index idx = 0; idx < self.size(); idx++) {
|
||||
Index curr = offset + idx * self.stride();
|
||||
if (self.exclusive()) {
|
||||
data[curr] = self.accumulator().finalize(accum);
|
||||
self.accumulator().reduce(self.inner().coeff(curr), &accum);
|
||||
} else {
|
||||
self.accumulator().reduce(self.inner().coeff(curr), &accum);
|
||||
data[curr] = self.accumulator().finalize(accum);
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
}
|
||||
|
||||
template <typename Self, typename Reducer>
|
||||
struct ScanLauncher<Self, Reducer, GpuDevice> {
|
||||
void operator()(const Self& self, typename Self::CoeffReturnType* data) {
|
||||
Index total_size = internal::array_prod(self.dimensions());
|
||||
Index num_blocks = (total_size / self.size() + 63) / 64;
|
||||
Index block_size = 64;
|
||||
LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
|
||||
}
|
||||
};
|
||||
#endif // EIGEN_USE_GPU && __CUDACC__
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
@@ -0,0 +1,264 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at the mozilla.org home page

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H

namespace Eigen {
|
||||
|
||||
/** \class TensorShuffling
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor shuffling class.
  *
  */
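// Typical use (a minimal sketch, assuming TensorBase::shuffle()): permute the
// dimensions of a tensor according to an index permutation.
//
// \code
// #include <unsupported/Eigen/CXX11/Tensor>
//
// Eigen::Tensor<float, 3> t(2, 3, 4);
// t.setRandom();
// Eigen::array<int, 3> perm{{1, 2, 0}};
// Eigen::Tensor<float, 3> s = t.shuffle(perm);  // dimensions become (3, 4, 2)
// \endcode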
namespace internal {
|
||||
template<typename Shuffle, typename XprType>
|
||||
struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename Shuffle, typename XprType>
|
||||
struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorShufflingOp<Shuffle, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename Shuffle, typename XprType>
|
||||
struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type>
|
||||
{
|
||||
typedef TensorShufflingOp<Shuffle, XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename Shuffle, typename XprType>
|
||||
class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle)
|
||||
: m_xpr(expr), m_shuffle(shuffle) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const Shuffle& shufflePermutation() const { return m_shuffle; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorShufflingOp, const TensorShufflingOp> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorShufflingOp, const OtherDerived> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const Shuffle m_shuffle;
|
||||
};
|
||||
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename Shuffle, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
{
|
||||
typedef TensorShufflingOp<Shuffle, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device)
|
||||
{
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
const Shuffle& shuffle = op.shufflePermutation();
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
m_dimensions[i] = input_dims[shuffle[i]];
|
||||
}
|
||||
|
||||
array<Index, NumDims> inputStrides;
|
||||
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
inputStrides[0] = 1;
|
||||
m_outputStrides[0] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
|
||||
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
|
||||
}
|
||||
} else {
|
||||
inputStrides[NumDims - 1] = 1;
|
||||
m_outputStrides[NumDims - 1] = 1;
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
|
||||
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
m_inputStrides[i] = inputStrides[shuffle[i]];
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_impl.coeff(srcCoeff(index));
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
|
||||
2 * TensorOpCost::MulCost<Index>() +
|
||||
TensorOpCost::DivCost<Index>());
|
||||
return m_impl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
|
||||
Index inputIndex = 0;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx = index / m_outputStrides[i];
|
||||
inputIndex += idx * m_inputStrides[i];
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
return inputIndex + index * m_inputStrides[0];
|
||||
} else {
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const Index idx = index / m_outputStrides[i];
|
||||
inputIndex += idx * m_inputStrides[i];
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
return inputIndex + index * m_inputStrides[NumDims - 1];
|
||||
}
|
||||
}
|
||||
|
||||
Dimensions m_dimensions;
|
||||
array<Index, NumDims> m_outputStrides;
|
||||
array<Index, NumDims> m_inputStrides;
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
};
|
||||
|
||||
|
||||
// Eval as lvalue
|
||||
template<typename Shuffle, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
: public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
{
|
||||
typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;
|
||||
|
||||
typedef TensorShufflingOp<Shuffle, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: Base(op, device)
|
||||
{ }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
||||
{
|
||||
return this->m_impl.coeffRef(this->srcCoeff(index));
|
||||
}
|
||||
|
||||
template <int StoreMode> EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
this->coeffRef(index+i) = values[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
@@ -0,0 +1,146 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at the mozilla.org home page

#ifndef EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
#define EIGEN_CXX11_TENSOR_TENSORSTORAGE_H

#ifdef EIGEN_TENSOR_STORAGE_CTOR_PLUGIN
  #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN EIGEN_TENSOR_STORAGE_CTOR_PLUGIN;
#else
  #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN
#endif

namespace Eigen {
|
||||
|
||||
/** \internal
  *
  * \class TensorStorage
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Stores the data of a tensor
  *
  * This class stores the data of fixed-size, dynamic-size or mixed tensors
  * as compactly as possible.
  *
  * \sa Tensor
  */
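// For context (a minimal sketch, assuming the public Tensor and TensorFixedSize
// types): TensorFixedSize relies on the pure fixed-size specialization below,
// while Tensor uses the dynamically allocated one.
//
// \code
// #include <unsupported/Eigen/CXX11/Tensor>
//
// Eigen::TensorFixedSize<float, Eigen::Sizes<2, 3, 4> > fixed;  // inline storage
// Eigen::Tensor<float, 3> dynamic(2, 3, 4);                     // heap storage
// dynamic.resize(4, 4, 4);                                      // storage reallocated
// \endcode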
template<typename T, typename Dimensions, int Options> class TensorStorage;
|
||||
|
||||
|
||||
// Pure fixed-size storage
|
||||
template<typename T, typename FixedDimensions, int Options_>
|
||||
class TensorStorage
|
||||
{
|
||||
private:
|
||||
static const std::size_t Size = FixedDimensions::total_size;
|
||||
|
||||
// Allocate an array of size at least one to prevent compiler warnings.
|
||||
static const std::size_t MinSize = max_n_1<Size>::size;
|
||||
EIGEN_ALIGN_MAX T m_data[MinSize];
|
||||
|
||||
FixedDimensions m_dimensions;
|
||||
|
||||
public:
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorStorage() {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE T *data() { return m_data; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const T *data() const { return m_data; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const FixedDimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); }
|
||||
};
|
||||
|
||||
|
||||
// pure dynamic
|
||||
template<typename T, typename IndexType, int NumIndices_, int Options_>
|
||||
class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
|
||||
{
|
||||
public:
|
||||
typedef IndexType Index;
|
||||
typedef DSizes<IndexType, NumIndices_> Dimensions;
|
||||
typedef TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> Self;
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() {
|
||||
if (NumIndices_ == 0) {
|
||||
m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1);
|
||||
}
|
||||
}
|
||||
EIGEN_DEVICE_FUNC TensorStorage(internal::constructor_without_unaligned_array_assert)
|
||||
: m_data(0), m_dimensions(internal::template repeat<NumIndices_, Index>(0)) {}
|
||||
EIGEN_DEVICE_FUNC TensorStorage(Index size, const array<Index, NumIndices_>& dimensions)
|
||||
: m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions)
|
||||
{ EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN }
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template <typename... DenseIndex>
|
||||
EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) {
|
||||
m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(m_dimensions));
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorStorage(const Self& other)
|
||||
: m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(other.m_dimensions)))
|
||||
, m_dimensions(other.m_dimensions)
|
||||
{
|
||||
internal::smart_copy(other.m_data, other.m_data+internal::array_prod(other.m_dimensions), m_data);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC Self& operator=(const Self& other)
|
||||
{
|
||||
if (this != &other) {
|
||||
Self tmp(other);
|
||||
this->swap(tmp);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); }
|
||||
EIGEN_DEVICE_FUNC void swap(Self& other)
|
||||
{ numext::swap(m_data,other.m_data); numext::swap(m_dimensions,other.m_dimensions); }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {return m_dimensions;}
|
||||
|
||||
EIGEN_DEVICE_FUNC void resize(Index size, const array<Index, NumIndices_>& nbDimensions)
|
||||
{
|
||||
const Index currentSz = internal::array_prod(m_dimensions);
|
||||
if(size != currentSz)
|
||||
{
|
||||
internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, currentSz);
|
||||
if (size)
|
||||
m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size);
|
||||
else if (NumIndices_ == 0) {
|
||||
m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1);
|
||||
}
|
||||
else
|
||||
m_data = 0;
|
||||
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
|
||||
}
|
||||
m_dimensions = nbDimensions;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
|
||||
|
||||
private:
|
||||
T *m_data;
|
||||
Dimensions m_dimensions;
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
@@ -0,0 +1,338 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at the mozilla.org home page

#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H

namespace Eigen {
|
||||
|
||||
/** \class TensorStriding
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor striding class.
  *
  */
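// Typical use (a minimal sketch, assuming TensorBase::stride()): keep every n-th
// coefficient along each dimension; output sizes are rounded up, as in the
// evaluator's ceil computation below.
//
// \code
// #include <unsupported/Eigen/CXX11/Tensor>
//
// Eigen::Tensor<float, 2> t(4, 6);
// t.setRandom();
// Eigen::array<Eigen::DenseIndex, 2> strides{{2, 3}};
// Eigen::Tensor<float, 2> s = t.stride(strides);  // dimensions become (2, 2)
// \endcode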
namespace internal {
|
||||
template<typename Strides, typename XprType>
|
||||
struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<typename Strides, typename XprType>
|
||||
struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorStridingOp<Strides, XprType>& type;
|
||||
};
|
||||
|
||||
template<typename Strides, typename XprType>
|
||||
struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type>
|
||||
{
|
||||
typedef TensorStridingOp<Strides, XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename Strides, typename XprType>
|
||||
class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
|
||||
: m_xpr(expr), m_dims(dims) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const Strides& strides() const { return m_dims; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorStridingOp& operator = (const TensorStridingOp& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorStridingOp, const TensorStridingOp> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorStridingOp, const OtherDerived> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const Strides m_dims;
|
||||
};
|
||||
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename Strides, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
{
|
||||
typedef TensorStridingOp<Strides, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device)
|
||||
{
|
||||
m_dimensions = m_impl.dimensions();
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
|
||||
}
|
||||
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_outputStrides[0] = 1;
|
||||
m_inputStrides[0] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
|
||||
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
|
||||
m_inputStrides[i-1] *= op.strides()[i-1];
|
||||
}
|
||||
m_inputStrides[NumDims-1] *= op.strides()[NumDims-1];
|
||||
} else { // RowMajor
|
||||
m_outputStrides[NumDims-1] = 1;
|
||||
m_inputStrides[NumDims-1] = 1;
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
|
||||
m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
|
||||
m_inputStrides[i+1] *= op.strides()[i+1];
|
||||
}
|
||||
m_inputStrides[0] *= op.strides()[0];
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_impl.coeff(srcCoeff(index));
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
Index inputIndices[] = {0, 0};
|
||||
Index indices[] = {index, index + PacketSize - 1};
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx0 = indices[0] / m_outputStrides[i];
|
||||
const Index idx1 = indices[1] / m_outputStrides[i];
|
||||
inputIndices[0] += idx0 * m_inputStrides[i];
|
||||
inputIndices[1] += idx1 * m_inputStrides[i];
|
||||
indices[0] -= idx0 * m_outputStrides[i];
|
||||
indices[1] -= idx1 * m_outputStrides[i];
|
||||
}
|
||||
inputIndices[0] += indices[0] * m_inputStrides[0];
|
||||
inputIndices[1] += indices[1] * m_inputStrides[0];
|
||||
} else { // RowMajor
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const Index idx0 = indices[0] / m_outputStrides[i];
|
||||
const Index idx1 = indices[1] / m_outputStrides[i];
|
||||
inputIndices[0] += idx0 * m_inputStrides[i];
|
||||
inputIndices[1] += idx1 * m_inputStrides[i];
|
||||
indices[0] -= idx0 * m_outputStrides[i];
|
||||
indices[1] -= idx1 * m_outputStrides[i];
|
||||
}
|
||||
inputIndices[0] += indices[0] * m_inputStrides[NumDims-1];
|
||||
inputIndices[1] += indices[1] * m_inputStrides[NumDims-1];
|
||||
}
|
||||
if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
|
||||
PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
|
||||
return rslt;
|
||||
}
|
||||
else {
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
values[0] = m_impl.coeff(inputIndices[0]);
|
||||
values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
|
||||
for (int i = 1; i < PacketSize-1; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost<Index>() +
|
||||
TensorOpCost::MulCost<Index>() +
|
||||
TensorOpCost::DivCost<Index>()) +
|
||||
TensorOpCost::MulCost<Index>();
|
||||
if (vectorized) {
|
||||
compute_cost *= 2; // packet() computes two indices
|
||||
}
|
||||
const int innerDim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : (NumDims - 1);
|
||||
return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) +
|
||||
// Computation is not vectorized per se, but it is done once per packet.
|
||||
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
|
||||
{
|
||||
Index inputIndex = 0;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx = index / m_outputStrides[i];
|
||||
inputIndex += idx * m_inputStrides[i];
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
inputIndex += index * m_inputStrides[0];
|
||||
} else { // RowMajor
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const Index idx = index / m_outputStrides[i];
|
||||
inputIndex += idx * m_inputStrides[i];
|
||||
index -= idx * m_outputStrides[i];
|
||||
}
|
||||
inputIndex += index * m_inputStrides[NumDims-1];
|
||||
}
|
||||
return inputIndex;
|
||||
}
|
||||
|
||||
Dimensions m_dimensions;
|
||||
array<Index, NumDims> m_outputStrides;
|
||||
array<Index, NumDims> m_inputStrides;
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
};
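The constructor and srcCoeff above implement the striding arithmetic: each output dimension is ceil(input_dim / stride), and an output coefficient index is decomposed against the output strides and re-composed against the input strides (pre-scaled by the user strides). Below is a minimal standalone sketch of that column-major mapping, not Eigen code; the 6 x 4 x 10 shape and the {2, 1, 5} strides are made-up values for illustration.

#include <array>
#include <cstdio>

int main() {
  const int NumDims = 3;
  // Hypothetical input tensor of shape 6 x 4 x 10, strided by {2, 1, 5}.
  std::array<long, NumDims> input_dims = {{6, 4, 10}};
  std::array<long, NumDims> strides    = {{2, 1, 5}};

  // Output dimensions: integer equivalent of the ceilf() in the constructor above.
  std::array<long, NumDims> output_dims;
  for (int i = 0; i < NumDims; ++i)
    output_dims[i] = (input_dims[i] + strides[i] - 1) / strides[i];

  // Column-major output/input strides; the input strides are scaled by the
  // requested striding factors, mirroring the ColMajor branch above.
  std::array<long, NumDims> output_strides, input_strides;
  output_strides[0] = 1;
  input_strides[0]  = 1;
  for (int i = 1; i < NumDims; ++i) {
    output_strides[i] = output_strides[i-1] * output_dims[i-1];
    input_strides[i]  = input_strides[i-1]  * input_dims[i-1];
    input_strides[i-1] *= strides[i-1];
  }
  input_strides[NumDims-1] *= strides[NumDims-1];

  // Equivalent of srcCoeff(index): map an output coefficient to its source coefficient.
  auto src_coeff = [&](long index) {
    long input_index = 0;
    for (int i = NumDims - 1; i > 0; --i) {
      const long idx = index / output_strides[i];
      input_index += idx * input_strides[i];
      index -= idx * output_strides[i];
    }
    return input_index + index * input_strides[0];
  };

  for (long i = 0; i < 8; ++i)
    std::printf("output %ld -> input %ld\n", i, src_coeff(i));
  return 0;
}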
|
||||
|
||||
|
||||
// Eval as lvalue
|
||||
template<typename Strides, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
|
||||
: public TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
{
|
||||
typedef TensorStridingOp<Strides, ArgType> XprType;
|
||||
typedef TensorEvaluator<const XprType, Device> Base;
|
||||
// typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
// typedef DSizes<Index, NumDims> Dimensions;
|
||||
|
||||
enum {
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: Base(op, device) { }
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
|
||||
{
|
||||
return this->m_impl.coeffRef(this->srcCoeff(index));
|
||||
}
|
||||
|
||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
|
||||
|
||||
Index inputIndices[] = {0, 0};
|
||||
Index indices[] = {index, index + PacketSize - 1};
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx0 = indices[0] / this->m_outputStrides[i];
|
||||
const Index idx1 = indices[1] / this->m_outputStrides[i];
|
||||
inputIndices[0] += idx0 * this->m_inputStrides[i];
|
||||
inputIndices[1] += idx1 * this->m_inputStrides[i];
|
||||
indices[0] -= idx0 * this->m_outputStrides[i];
|
||||
indices[1] -= idx1 * this->m_outputStrides[i];
|
||||
}
|
||||
inputIndices[0] += indices[0] * this->m_inputStrides[0];
|
||||
inputIndices[1] += indices[1] * this->m_inputStrides[0];
|
||||
} else { // RowMajor
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const Index idx0 = indices[0] / this->m_outputStrides[i];
|
||||
const Index idx1 = indices[1] / this->m_outputStrides[i];
|
||||
inputIndices[0] += idx0 * this->m_inputStrides[i];
|
||||
inputIndices[1] += idx1 * this->m_inputStrides[i];
|
||||
indices[0] -= idx0 * this->m_outputStrides[i];
|
||||
indices[1] -= idx1 * this->m_outputStrides[i];
|
||||
}
|
||||
inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1];
|
||||
inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1];
|
||||
}
|
||||
if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
|
||||
this->m_impl.template writePacket<Unaligned>(inputIndices[0], x);
|
||||
}
|
||||
else {
|
||||
EIGEN_ALIGN_MAX Scalar values[PacketSize];
|
||||
internal::pstore<Scalar, PacketReturnType>(values, x);
|
||||
this->m_impl.coeffRef(inputIndices[0]) = values[0];
|
||||
this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
|
||||
for (int i = 1; i < PacketSize-1; ++i) {
|
||||
this->coeffRef(index+i) = values[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
|
|
@ -0,0 +1,82 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: eigen@codeplay.com
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
// General include header of SYCL target for Tensor Module
|
||||
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
|
||||
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
|
||||
// global pointer to set different attribute state for a class
|
||||
template <class T>
|
||||
struct MakeGlobalPointer {
|
||||
typedef typename cl::sycl::global_ptr<T>::pointer_t Type;
|
||||
};
|
||||
|
||||
// local pointer to set different attribute state for a class
|
||||
template <class T>
|
||||
struct MakeLocalPointer {
|
||||
typedef typename cl::sycl::local_ptr<T>::pointer_t Type;
|
||||
};
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
namespace TensorSycl {
|
||||
namespace internal {
|
||||
|
||||
/// This struct is used for special expression nodes with no operation (for example TensorAssignOp and TensorSelectOp).
|
||||
struct NoOP;
|
||||
|
||||
template<bool IsConst, typename T> struct GetType{
|
||||
typedef const T Type;
|
||||
};
|
||||
template<typename T> struct GetType<false, T>{
|
||||
typedef T Type;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// tuple construction
|
||||
#include "TensorSyclTuple.h"
|
||||
|
||||
// counting number of leaf at compile time
|
||||
#include "TensorSyclLeafCount.h"
|
||||
|
||||
// The index PlaceHolder takes the actual expression and replaces the actual
// data in it with a placeholder. It uses the same pre-order expression tree
// traversal as the leaf count in order to give the right access number to
// each node in the expression.
|
||||
#include "TensorSyclPlaceHolderExpr.h"
|
||||
|
||||
// creation of an accessor tuple from a tuple of SYCL buffers
|
||||
#include "TensorSyclExtractAccessor.h"
|
||||
|
||||
// this is used to change the address space type in tensor map for GPU
|
||||
#include "TensorSyclConvertToDeviceExpression.h"
|
||||
|
||||
// this is used to extract the functors
|
||||
#include "TensorSyclExtractFunctors.h"
|
||||
|
||||
// this is used to create tensormap on the device
|
||||
// this is used to construct the expression on the device
|
||||
#include "TensorSyclExprConstructor.h"
|
||||
|
||||
/// this is used for extracting tensor reduction
|
||||
#include "TensorReductionSycl.h"
|
||||
|
||||
// kernel execution using fusion
|
||||
#include "TensorSyclRun.h"
|
||||
|
||||
#endif // end of EIGEN_USE_SYCL
|
||||
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
|
|
@ -0,0 +1,121 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
/*****************************************************************
|
||||
* TensorSyclConvertToDeviceExpression.h
|
||||
*
|
||||
* \brief:
|
||||
* Conversion from host pointer to device pointer
|
||||
* inside leaf nodes of the expression.
|
||||
*
|
||||
*****************************************************************/
|
||||
|
||||
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
|
||||
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
|
||||
|
||||
namespace Eigen {
|
||||
namespace TensorSycl {
|
||||
namespace internal {
|
||||
|
||||
/// \struct ConvertToDeviceExpression
|
||||
/// \brief This struct is used to convert the MakePointer in the host expression
/// to MakeGlobalPointer for the device expression, for the leaf nodes
/// containing the pointer. This is because the address space of
/// the pointer T* is different on the host and on the device.
|
||||
template <typename Expr>
|
||||
struct ConvertToDeviceExpression;
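As a rough standalone illustration of that conversion (no Eigen or SYCL types involved), the sketch below rewrites the pointer-making policy of each leaf of a toy expression type from a hypothetical HostPtr tag to a DevicePtr tag while preserving constness; this is the same shape of type-level rewrite the specialisations below perform with MakePointer_ and MakeGlobalPointer.

#include <type_traits>

template <class T> struct HostPtr   { typedef T* type; };
template <class T> struct DevicePtr { typedef T* type; };  // device address space in real SYCL

template <class Scalar, template <class> class MakePtr> struct ToyLeaf {};
template <class L, class R> struct ToyPlus {};

template <class Expr> struct ToDevice;  // primary template, specialised per node type

// Leaf: swap the pointer policy; a separate const specialisation keeps constness.
template <class Scalar, template <class> class MakePtr>
struct ToDevice<ToyLeaf<Scalar, MakePtr> > { typedef ToyLeaf<Scalar, DevicePtr> Type; };
template <class Scalar, template <class> class MakePtr>
struct ToDevice<const ToyLeaf<Scalar, MakePtr> > { typedef const ToyLeaf<Scalar, DevicePtr> Type; };

// Non-leaf: recurse into the children.
template <class L, class R>
struct ToDevice<ToyPlus<L, R> > {
  typedef ToyPlus<typename ToDevice<L>::Type, typename ToDevice<R>::Type> Type;
};

typedef ToyPlus<const ToyLeaf<float, HostPtr>, ToyLeaf<float, HostPtr> > HostExpr;
typedef ToDevice<HostExpr>::Type DeviceExpr;
static_assert(std::is_same<DeviceExpr,
                           ToyPlus<const ToyLeaf<float, DevicePtr>, ToyLeaf<float, DevicePtr> > >::value,
              "leaf pointer policies are rewritten for the device");
int main() { return 0; }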
|
||||
|
||||
template<template<class...> class NonOpCategory, bool IsConst, typename... Args>
|
||||
struct NonOpConversion{
|
||||
typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type...> >::Type Type;
|
||||
};
|
||||
|
||||
|
||||
template<template<class, template <class> class > class NonOpCategory, bool IsConst, typename Args>
|
||||
struct DeviceConvertor{
|
||||
typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type, MakeGlobalPointer> >::Type Type;
|
||||
};
|
||||
|
||||
/// specialisation of the \ref ConvertToDeviceExpression struct when the node
|
||||
/// type is TensorMap
|
||||
#define TENSORMAPCONVERT(CVQual)\
|
||||
template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_>\
|
||||
struct ConvertToDeviceExpression<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_> > {\
|
||||
typedef CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer> Type;\
|
||||
};
|
||||
|
||||
TENSORMAPCONVERT(const)
|
||||
TENSORMAPCONVERT()
|
||||
#undef TENSORMAPCONVERT
|
||||
|
||||
/// specialisation of the \ref ConvertToDeviceExpression struct when the node
|
||||
/// type is TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, TensorBroadcastingOp
|
||||
#define CATEGORYCONVERT(CVQual)\
|
||||
template <template<class, class...> class Category, typename OP, typename... subExprs>\
|
||||
struct ConvertToDeviceExpression<CVQual Category<OP, subExprs...> > {\
|
||||
typedef CVQual Category<OP, typename ConvertToDeviceExpression<subExprs>::Type... > Type;\
|
||||
};
|
||||
CATEGORYCONVERT(const)
|
||||
CATEGORYCONVERT()
|
||||
#undef CATEGORYCONVERT
|
||||
|
||||
|
||||
/// specialisation of the \ref ConvertToDeviceExpression struct when the node
|
||||
/// type is TensorCwiseSelectOp
|
||||
#define SELECTOPCONVERT(CVQual, Res)\
|
||||
template <typename IfExpr, typename ThenExpr, typename ElseExpr>\
|
||||
struct ConvertToDeviceExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >\
|
||||
: NonOpConversion<TensorSelectOp, Res, IfExpr, ThenExpr, ElseExpr> {};
|
||||
SELECTOPCONVERT(const, true)
|
||||
SELECTOPCONVERT(, false)
|
||||
#undef SELECTOPCONVERT
|
||||
|
||||
/// specialisation of the \ref ConvertToDeviceExpression struct when the node
|
||||
/// type is const TensorAssignOp
|
||||
#define ASSIGNCONVERT(CVQual, Res)\
|
||||
template <typename LHSExpr, typename RHSExpr>\
|
||||
struct ConvertToDeviceExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr> >\
|
||||
: NonOpConversion<TensorAssignOp, Res, LHSExpr, RHSExpr>{};
|
||||
|
||||
ASSIGNCONVERT(const, true)
|
||||
ASSIGNCONVERT(, false)
|
||||
#undef ASSIGNCONVERT
|
||||
|
||||
/// specialisation of the \ref ConvertToDeviceExpression struct when the node
|
||||
/// type is either TensorForcedEvalOp or TensorEvalToOp
|
||||
#define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\
|
||||
template <typename Expr>\
|
||||
struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
|
||||
: DeviceConvertor<ExprNode, Res, Expr>{};
|
||||
|
||||
KERNELBROKERCONVERT(const, true, TensorForcedEvalOp)
|
||||
KERNELBROKERCONVERT(, false, TensorForcedEvalOp)
|
||||
KERNELBROKERCONVERT(const, true, TensorEvalToOp)
|
||||
KERNELBROKERCONVERT(, false, TensorEvalToOp)
|
||||
#undef KERNELBROKERCONVERT
|
||||
|
||||
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
|
||||
#define KERNELBROKERCONVERTREDUCTION(CVQual)\
|
||||
template <typename OP, typename Dim, typename subExpr, template <class> class MakePointer_>\
|
||||
struct ConvertToDeviceExpression<CVQual TensorReductionOp<OP, Dim, subExpr, MakePointer_> > {\
|
||||
typedef CVQual TensorReductionOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type, MakeGlobalPointer> Type;\
|
||||
};
|
||||
|
||||
KERNELBROKERCONVERTREDUCTION(const)
|
||||
KERNELBROKERCONVERTREDUCTION()
|
||||
#undef KERNELBROKERCONVERTREDUCTION
|
||||
|
||||
} // namespace internal
|
||||
} // namespace TensorSycl
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
|
|
@ -0,0 +1,239 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
/*****************************************************************
|
||||
* TensorSyclExprConstructor.h
|
||||
*
|
||||
* \brief:
|
||||
* This file re-creates an expression on the SYCL device in order
|
||||
* to use the original tensor evaluator.
|
||||
*
|
||||
*****************************************************************/
|
||||
|
||||
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
|
||||
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
|
||||
|
||||
namespace Eigen {
|
||||
namespace TensorSycl {
|
||||
namespace internal {
|
||||
/// this class is used by EvalToOp in order to create an lhs expression which is
/// a pointer from an accessor on a device-only buffer
|
||||
template <typename PtrType, size_t N, typename... Params>
|
||||
struct EvalToLHSConstructor {
|
||||
PtrType expr;
|
||||
EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t): expr((&(*(utility::tuple::get<N>(t).get_pointer())))) {}
|
||||
};
|
||||
|
||||
/// struct ExprConstructor is used to reconstruct the expression on the device and
/// recreate the expression with MakeGlobalPointer containing the device address
/// space for the TensorMap pointers used in the eval function.
/// It receives the original expression type, the functor of the node, the tuple
/// of accessors, and the device expression type to re-instantiate the
/// expression tree for the device.
|
||||
template <typename OrigExpr, typename IndexExpr, typename... Params>
|
||||
struct ExprConstructor;
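A very rough standalone sketch of that reconstruction, assuming nothing from Eigen or SYCL and using invented names (LeafExpr, UnaryExpr, UnaryDetector, rebuild_leaf, rebuild_unary): a host-side descriptor carrying the functor, plus a tuple of device pointers, is turned back into a concrete expression object whose leaf refers to tuple entry N. This is roughly the role the specialisations below play with FuncDetector and the accessor tuple.

#include <cstddef>
#include <tuple>
#include <cassert>

struct LeafExpr {                 // device-side leaf: a raw pointer plus a size
  const float* data;
  int size;
};

template <class OP, class RHS> struct UnaryExpr {  // device-side unary node
  OP op;
  RHS rhs;
};

struct Negate { float operator()(float x) const { return -x; } };

// Host-side descriptor of a unary node: it carries the functor (and, in the
// real code, the dimensions); the data pointer travels separately in the tuple.
template <class OP> struct UnaryDetector { OP func; int size; };

// Rebuild a leaf from entry N of the pointer tuple.
template <std::size_t N, class Detector, class... Ptrs>
LeafExpr rebuild_leaf(const Detector& d, const std::tuple<Ptrs...>& t) {
  return LeafExpr{std::get<N>(t), d.size};
}

// Rebuild a unary node around an already rebuilt child expression.
template <class OP>
UnaryExpr<OP, LeafExpr> rebuild_unary(const UnaryDetector<OP>& d, LeafExpr child) {
  return UnaryExpr<OP, LeafExpr>{d.func, child};
}

int main() {
  float buffer[3] = {1.f, 2.f, 4.f};
  auto pointers = std::make_tuple(static_cast<const float*>(buffer));
  UnaryDetector<Negate> detector{Negate{}, 3};

  auto expr = rebuild_unary(detector, rebuild_leaf<0>(detector, pointers));
  assert(expr.op(expr.rhs.data[1]) == -2.f);  // evaluate one coefficient
  return 0;
}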
|
||||
|
||||
/// specialisation of the \ref ExprConstructor struct when the node type is
|
||||
/// TensorMap
|
||||
#define TENSORMAP(CVQual)\
|
||||
template <typename Scalar_, int Options_, int Options2_, int Options3_, int NumIndices_, typename IndexType_,\
|
||||
template <class> class MakePointer_, size_t N, typename... Params>\
|
||||
struct ExprConstructor< CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>,\
|
||||
CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options3_, MakePointer_>, N>, Params...>{\
|
||||
typedef CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer> Type;\
|
||||
Type expr;\
|
||||
template <typename FuncDetector>\
|
||||
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
|
||||
: expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
|
||||
};
|
||||
|
||||
TENSORMAP(const)
|
||||
TENSORMAP()
|
||||
#undef TENSORMAP
|
||||
|
||||
#define UNARYCATEGORY(CVQual)\
|
||||
template <template<class, class> class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\
|
||||
struct ExprConstructor<CVQual UnaryCategory<OP, OrigRHSExpr>, CVQual UnaryCategory<OP, RHSExpr>, Params...> {\
|
||||
typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_type;\
|
||||
my_type rhsExpr;\
|
||||
typedef CVQual UnaryCategory<OP, typename my_type::Type> Type;\
|
||||
Type expr;\
|
||||
template <typename FuncDetector>\
|
||||
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
|
||||
: rhsExpr(funcD.rhsExpr, t), expr(rhsExpr.expr, funcD.func) {}\
|
||||
};
|
||||
|
||||
UNARYCATEGORY(const)
|
||||
UNARYCATEGORY()
|
||||
#undef UNARYCATEGORY
|
||||
|
||||
/// specialisation of the \ref ExprConstructor struct when the node type is
|
||||
/// TensorBinaryOp
|
||||
#define BINARYCATEGORY(CVQual)\
|
||||
template <template<class, class, class> class BinaryCategory, typename OP, typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr,\
|
||||
typename RHSExpr, typename... Params>\
|
||||
struct ExprConstructor<CVQual BinaryCategory<OP, OrigLHSExpr, OrigRHSExpr>, CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Params...> {\
|
||||
typedef ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\
|
||||
typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\
|
||||
typedef CVQual BinaryCategory<OP, typename my_left_type::Type, typename my_right_type::Type> Type;\
|
||||
my_left_type lhsExpr;\
|
||||
my_right_type rhsExpr;\
|
||||
Type expr;\
|
||||
template <typename FuncDetector>\
|
||||
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
|
||||
: lhsExpr(funcD.lhsExpr, t),rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr, funcD.func) {}\
|
||||
};
|
||||
|
||||
BINARYCATEGORY(const)
|
||||
BINARYCATEGORY()
|
||||
#undef BINARYCATEGORY
|
||||
|
||||
/// specialisation of the \ref ExprConstructor struct when the node type is
|
||||
/// TensorCwiseTernaryOp
|
||||
#define TERNARYCATEGORY(CVQual)\
|
||||
template <template <class, class, class, class> class TernaryCategory, typename OP, typename OrigArg1Expr, typename OrigArg2Expr,typename OrigArg3Expr,\
|
||||
typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename... Params>\
|
||||
struct ExprConstructor<CVQual TernaryCategory<OP, OrigArg1Expr, OrigArg2Expr, OrigArg3Expr>, CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Params...> {\
|
||||
typedef ExprConstructor<OrigArg1Expr, Arg1Expr, Params...> my_arg1_type;\
|
||||
typedef ExprConstructor<OrigArg2Expr, Arg2Expr, Params...> my_arg2_type;\
|
||||
typedef ExprConstructor<OrigArg3Expr, Arg3Expr, Params...> my_arg3_type;\
|
||||
typedef CVQual TernaryCategory<OP, typename my_arg1_type::Type, typename my_arg2_type::Type, typename my_arg3_type::Type> Type;\
|
||||
my_arg1_type arg1Expr;\
|
||||
my_arg2_type arg2Expr;\
|
||||
my_arg3_type arg3Expr;\
|
||||
Type expr;\
|
||||
template <typename FuncDetector>\
|
||||
ExprConstructor(FuncDetector &funcD,const utility::tuple::Tuple<Params...> &t)\
|
||||
: arg1Expr(funcD.arg1Expr, t), arg2Expr(funcD.arg2Expr, t), arg3Expr(funcD.arg3Expr, t), expr(arg1Expr.expr, arg2Expr.expr, arg3Expr.expr, funcD.func) {}\
|
||||
};
|
||||
|
||||
TERNARYCATEGORY(const)
|
||||
TERNARYCATEGORY()
|
||||
#undef TERNARYCATEGORY
|
||||
|
||||
/// specialisation of the \ref ExprConstructor struct when the node type is
|
||||
/// TensorCwiseSelectOp
|
||||
#define SELECTOP(CVQual)\
|
||||
template <typename OrigIfExpr, typename OrigThenExpr, typename OrigElseExpr, typename IfExpr, typename ThenExpr, typename ElseExpr, typename... Params>\
|
||||
struct ExprConstructor< CVQual TensorSelectOp<OrigIfExpr, OrigThenExpr, OrigElseExpr>, CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Params...> {\
|
||||
typedef ExprConstructor<OrigIfExpr, IfExpr, Params...> my_if_type;\
|
||||
typedef ExprConstructor<OrigThenExpr, ThenExpr, Params...> my_then_type;\
|
||||
typedef ExprConstructor<OrigElseExpr, ElseExpr, Params...> my_else_type;\
|
||||
typedef CVQual TensorSelectOp<typename my_if_type::Type, typename my_then_type::Type, typename my_else_type::Type> Type;\
|
||||
my_if_type ifExpr;\
|
||||
my_then_type thenExpr;\
|
||||
my_else_type elseExpr;\
|
||||
Type expr;\
|
||||
template <typename FuncDetector>\
|
||||
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
|
||||
: ifExpr(funcD.ifExpr, t), thenExpr(funcD.thenExpr, t), elseExpr(funcD.elseExpr, t), expr(ifExpr.expr, thenExpr.expr, elseExpr.expr) {}\
|
||||
};
|
||||
|
||||
SELECTOP(const)
|
||||
SELECTOP()
|
||||
#undef SELECTOP
|
||||
|
||||
/// specialisation of the \ref ExprConstructor struct when the node type is
|
||||
/// const TensorAssignOp
|
||||
#define ASSIGN(CVQual)\
|
||||
template <typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr, typename RHSExpr, typename... Params>\
|
||||
struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual TensorAssignOp<LHSExpr, RHSExpr>, Params...> {\
|
||||
typedef ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\
|
||||
typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\
|
||||
typedef CVQual TensorAssignOp<typename my_left_type::Type, typename my_right_type::Type> Type;\
|
||||
my_left_type lhsExpr;\
|
||||
my_right_type rhsExpr;\
|
||||
Type expr;\
|
||||
template <typename FuncDetector>\
|
||||
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
|
||||
: lhsExpr(funcD.lhsExpr, t), rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr) {}\
|
||||
};
|
||||
|
||||
ASSIGN(const)
|
||||
ASSIGN()
|
||||
#undef ASSIGN
|
||||
/// specialisation of the \ref ExprConstructor struct when the node type is
|
||||
/// TensorEvalToOp
|
||||
#define EVALTO(CVQual)\
|
||||
template <typename OrigExpr, typename Expr, typename... Params>\
|
||||
struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQual TensorEvalToOp<Expr>, Params...> {\
|
||||
typedef ExprConstructor<OrigExpr, Expr, Params...> my_expr_type;\
|
||||
typedef typename TensorEvalToOp<OrigExpr, MakeGlobalPointer>::PointerType my_buffer_type;\
|
||||
typedef CVQual TensorEvalToOp<typename my_expr_type::Type, MakeGlobalPointer> Type;\
|
||||
my_expr_type nestedExpression;\
|
||||
EvalToLHSConstructor<my_buffer_type, 0, Params...> buffer;\
|
||||
Type expr;\
|
||||
template <typename FuncDetector>\
|
||||
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
|
||||
: nestedExpression(funcD.rhsExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\
|
||||
};
|
||||
|
||||
EVALTO(const)
|
||||
EVALTO()
|
||||
#undef EVALTO
|
||||
|
||||
/// specialisation of the \ref ExprConstructor struct when the node type is
|
||||
/// TensorForcedEvalOp
|
||||
#define FORCEDEVAL(CVQual)\
|
||||
template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
|
||||
struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\
|
||||
CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
|
||||
typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\
|
||||
TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, 0, typename TensorForcedEvalOp<DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
|
||||
Type expr;\
|
||||
template <typename FuncDetector>\
|
||||
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
|
||||
: expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
|
||||
};
|
||||
|
||||
FORCEDEVAL(const)
|
||||
FORCEDEVAL()
|
||||
#undef FORCEDEVAL
|
||||
|
||||
template <bool Conds, size_t X, size_t Y> struct ValueCondition {
static const size_t Res = X;
};
template <size_t X, size_t Y> struct ValueCondition<false, X, Y> {
static const size_t Res = Y;
};
|
||||
|
||||
/// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp
|
||||
#define SYCLREDUCTIONEXPR(CVQual)\
|
||||
template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
|
||||
struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\
|
||||
CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
|
||||
static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
|
||||
typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
|
||||
NumIndices, 0, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
|
||||
Type expr;\
|
||||
template <typename FuncDetector>\
|
||||
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
|
||||
: expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
|
||||
};
|
||||
|
||||
SYCLREDUCTIONEXPR(const)
|
||||
SYCLREDUCTIONEXPR()
|
||||
#undef SYCLREDUCTIONEXPR
|
||||
|
||||
/// template deduction for \ref ExprConstructor struct
|
||||
template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
|
||||
auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple<Params...> &t)
|
||||
-> decltype(ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t)) {
|
||||
return ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t);
|
||||
}
|
||||
|
||||
} // namespace internal
} // namespace TensorSycl
} // namespace Eigen
|
||||
|
||||
|
||||
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
|
|
@ -0,0 +1,204 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
/*****************************************************************
|
||||
* TensorSyclExtractAccessor.h
|
||||
*
|
||||
* \brief:
|
||||
* ExtractAccessor takes the PlaceHolder expression and the tuple of SYCL
* buffers as input. Using pre-order tree traversal, ExtractAccessor
|
||||
* recursively calls itself for its children in the expression tree. The
|
||||
* leaf node in the PlaceHolder expression is nothing but a container preserving
|
||||
* the order of the actual data in the tuple of sycl buffer. By invoking the
|
||||
* extract accessor for the PlaceHolder<N>, an accessor is created for the Nth
|
||||
* buffer in the tuple of buffers. This accessor is then added as an Nth
|
||||
* element in the tuple of accessors. In this case we preserve the order of data
|
||||
* in the expression tree.
|
||||
*
|
||||
* This is the specialisation of extract accessor method for different operation
|
||||
* type in the PlaceHolder expression.
|
||||
*
|
||||
*****************************************************************/
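A small standalone C++14 sketch of that traversal, with hypothetical Leaf/Binary node types standing in for the tensor expression nodes: each leaf contributes one entry, interior nodes concatenate their children's tuples left to right, so position N in the resulting tuple corresponds to the N-th leaf visited.

#include <tuple>
#include <string>
#include <iostream>

struct Leaf { std::string name; };
template <class L, class R> struct Binary { L lhs; R rhs; };

// Leaf: contributes a single "accessor" (here just the leaf's name).
inline std::tuple<std::string> gather(const Leaf& l) { return std::make_tuple(l.name); }

// Non-leaf: visit the children left-to-right and concatenate their tuples.
template <class L, class R>
auto gather(const Binary<L, R>& node) {
  return std::tuple_cat(gather(node.lhs), gather(node.rhs));
}

int main() {
  Binary<Leaf, Binary<Leaf, Leaf>> expr{{"A"}, {{"B"}, {"C"}}};
  auto accessors = gather(expr);  // std::tuple<std::string, std::string, std::string>
  std::cout << std::get<0>(accessors) << std::get<1>(accessors)
            << std::get<2>(accessors) << "\n";  // prints ABC
  return 0;
}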
|
||||
|
||||
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
|
||||
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
|
||||
|
||||
namespace Eigen {
|
||||
namespace TensorSycl {
|
||||
namespace internal {
|
||||
/// struct ExtractAccessor: This struct is used to extract the accessor from a
/// buffer. Depending on the type of the leaf node we get either a read accessor
/// or a read_write accessor.
|
||||
template <typename Evaluator>
|
||||
struct ExtractAccessor;
|
||||
|
||||
struct AccessorConstructor{
|
||||
template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, Arg eval)
|
||||
-> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval)) {
|
||||
return ExtractAccessor<Arg>::getTuple(cgh, eval);
|
||||
}
|
||||
|
||||
template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1, Arg2 eval2)
|
||||
-> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) {
|
||||
return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2));
|
||||
}
|
||||
template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1 , Arg2 eval2 , Arg3 eval3)
|
||||
-> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) {
|
||||
return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)));
|
||||
}
|
||||
template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, Arg eval)
|
||||
-> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM,
|
||||
typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()))){
|
||||
return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM, typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()));
|
||||
}
|
||||
};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is
|
||||
/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp and const TensorBroadcastingOp
|
||||
template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> eval)
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){
|
||||
return AccessorConstructor::getTuple(cgh, eval.impl());
|
||||
}
|
||||
};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp
|
||||
template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
|
||||
: ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorCwiseBinaryOp
|
||||
template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> eval)
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
|
||||
return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
|
||||
}
|
||||
};
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
|
||||
template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >
|
||||
: ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is
|
||||
/// const TensorCwiseTernaryOp
|
||||
template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> eval)
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){
|
||||
return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());
|
||||
}
|
||||
};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseTernaryOp
|
||||
template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
|
||||
: ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is
|
||||
/// const TensorCwiseSelectOp. This is a special case where there is no OP
|
||||
template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> eval)
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){
|
||||
return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());
|
||||
}
|
||||
};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is
|
||||
/// TensorCwiseSelectOp. This is a special case where there is no OP
|
||||
template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
|
||||
: ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >{};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorAssignOp
|
||||
template <typename LHSExpr, typename RHSExpr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> eval)
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
|
||||
return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
|
||||
}
|
||||
};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorAssignOp
|
||||
template <typename LHSExpr, typename RHSExpr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
|
||||
: ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorMap
|
||||
#define TENSORMAPEXPR(CVQual, ACCType)\
|
||||
template <typename PlainObjectType, int Options_, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> eval)\
|
||||
-> decltype(AccessorConstructor::template getAccessor<ACCType>(cgh, eval)){\
|
||||
return AccessorConstructor::template getAccessor<ACCType>(cgh, eval);\
|
||||
}\
|
||||
};
|
||||
TENSORMAPEXPR(const, cl::sycl::access::mode::read)
|
||||
TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
|
||||
#undef TENSORMAPEXPR
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorForcedEvalOp
|
||||
template <typename Expr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> > {
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> eval)
|
||||
-> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
|
||||
return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
|
||||
}
|
||||
};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
|
||||
template <typename Expr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<TensorForcedEvalOp<Expr>, Dev> >
|
||||
: ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> >{};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorEvalToOp
|
||||
template <typename Expr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> > {
|
||||
static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<const TensorEvalToOp<Expr>, Dev> eval)
|
||||
-> decltype(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){
|
||||
return utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));
|
||||
}
|
||||
};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
|
||||
template <typename Expr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<TensorEvalToOp<Expr>, Dev> >
|
||||
: ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> >{};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorReductionOp
|
||||
template <typename OP, typename Dim, typename Expr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> > {
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> eval)
|
||||
-> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
|
||||
return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
|
||||
}
|
||||
};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
|
||||
template <typename OP, typename Dim, typename Expr, typename Dev>
|
||||
struct ExtractAccessor<TensorEvaluator<TensorReductionOp<OP, Dim, Expr>, Dev> >
|
||||
: ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> >{};
|
||||
|
||||
/// template deduction for \ref ExtractAccessor
|
||||
template <typename Evaluator>
|
||||
auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& expr)
|
||||
-> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, expr)) {
|
||||
return ExtractAccessor<Evaluator>::getTuple(cgh, expr);
|
||||
}
|
||||
|
||||
} // namespace internal
} // namespace TensorSycl
} // namespace Eigen
|
||||
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
|
|
@ -0,0 +1,177 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
/*****************************************************************
|
||||
* TensorSyclExtractFunctors.h
|
||||
*
|
||||
* \brief:
|
||||
* Used to extract all the functors allocated to each node of the expression
* tree.
|
||||
*
|
||||
*****************************************************************/
|
||||
|
||||
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
|
||||
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
|
||||
|
||||
namespace Eigen {
|
||||
namespace TensorSycl {
|
||||
namespace internal {
|
||||
/// struct FunctorExtractor: This struct is used to extract the functors
/// constructed on the host side, to pack them and reuse them when the
/// expression is reconstructed on the device.
/// We have to do this because Eigen's functors are not stateless, so we cannot
/// re-instantiate them on the device; the instantiated functors must be passed
/// to the device instead.
// This struct is used for leaf nodes (TensorMap) and nodes behaving like leaf nodes (TensorForcedEval).
|
||||
template <typename Evaluator> struct FunctorExtractor{
|
||||
typedef typename Evaluator::Dimensions Dimensions;
|
||||
const Dimensions m_dimensions;
|
||||
const Dimensions& dimensions() const { return m_dimensions; }
|
||||
FunctorExtractor(const Evaluator& expr)
|
||||
: m_dimensions(expr.dimensions()) {}
|
||||
|
||||
};
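A minimal standalone sketch of that idea, using invented names (ScaleBy, Extracted): a stateful functor constructed on the host is captured by value so it can be copied to the device side and applied there, which is essentially what the func member stores in the specialisations that follow.

#include <vector>
#include <cassert>

struct ScaleBy {                 // a stateful functor: not re-creatable from its type alone
  float factor;
  float operator()(float x) const { return factor * x; }
};

template <class Functor>
struct Extracted {               // plays the role of FunctorExtractor
  Functor func;                  // copied by value, so it can travel to the device
  explicit Extracted(const Functor& f) : func(f) {}
};

int main() {
  ScaleBy host_functor{2.5f};                 // constructed on the "host"
  Extracted<ScaleBy> packed(host_functor);    // packed for the "device"

  std::vector<float> data = {1.f, 2.f, 4.f};
  for (float& x : data) x = packed.func(x);   // re-applied on the other side
  assert(data[2] == 10.f);
  return 0;
}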
|
||||
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is
|
||||
/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp, and const TensorBroadcastingOp
|
||||
template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
|
||||
struct FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
|
||||
FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
|
||||
OP func;
|
||||
FunctorExtractor(const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev>& expr)
|
||||
: rhsExpr(expr.impl()), func(expr.functor()) {}
|
||||
};
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is
|
||||
/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp
|
||||
template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
|
||||
struct FunctorExtractor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
|
||||
: FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> >{};
|
||||
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is
|
||||
/// const TensorCwiseBinaryOp
|
||||
template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
|
||||
struct FunctorExtractor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
|
||||
FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
|
||||
FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
|
||||
OP func;
|
||||
FunctorExtractor(const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)
|
||||
: lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}
|
||||
};
|
||||
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is
|
||||
/// const TensorCwiseBinaryOp
|
||||
template <template <class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
|
||||
struct FunctorExtractor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >
|
||||
: FunctorExtractor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{};
|
||||
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is
|
||||
/// const TensorCwiseTernaryOp
|
||||
template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>
|
||||
struct FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
|
||||
FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;
|
||||
FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;
|
||||
FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;
|
||||
OP func;
|
||||
FunctorExtractor(const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)
|
||||
: arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}
|
||||
};
|
||||
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is
|
||||
/// TensorCwiseTernaryOp
|
||||
template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
|
||||
struct FunctorExtractor<TensorEvaluator< TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
|
||||
:FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
|
||||
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is
|
||||
/// const TensorCwiseSelectOp. This is a specialisation without an OP, so it has to be separated.
|
||||
template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
|
||||
struct FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
|
||||
FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;
|
||||
FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;
|
||||
FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;
|
||||
FunctorExtractor(const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)
|
||||
: ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}
|
||||
};
|
||||
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is
|
||||
/// TensorCwiseSelectOp. This is a specialisation without an OP, so it has to be separated.
|
||||
template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
|
||||
struct FunctorExtractor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
|
||||
:FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {};
|
||||
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is
|
||||
/// const TensorAssignOp. This is a specialisation without an OP, so it has to be separated.
|
||||
template <typename LHSExpr, typename RHSExpr, typename Dev>
|
||||
struct FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
|
||||
FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
|
||||
FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
|
||||
FunctorExtractor(const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)
|
||||
: lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}
|
||||
};
|
||||
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is
|
||||
/// TensorAssignOp. This is a specialisation without an OP, so it has to be separated.
|
||||
template <typename LHSExpr, typename RHSExpr, typename Dev>
|
||||
struct FunctorExtractor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
|
||||
:FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
|
||||
|
||||
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is
|
||||
/// const TensorEvalToOp. This is a specialisation without an OP, so it has to be separated.
|
||||
template <typename RHSExpr, typename Dev>
|
||||
struct FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {
|
||||
FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
|
||||
FunctorExtractor(const TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev>& expr)
|
||||
: rhsExpr(expr.impl()) {}
|
||||
};
|
||||
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is
|
||||
/// TensorEvalToOp. This is a specialisation without OP so it has to be separated.
|
||||
template <typename RHSExpr, typename Dev>
|
||||
struct FunctorExtractor<TensorEvaluator<TensorEvalToOp<RHSExpr>, Dev> >
|
||||
: FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {};
|
||||
|
||||
template<typename Dim, size_t NumOutputDim> struct DimConstr {
|
||||
template<typename InDim>
|
||||
static inline Dim getDim(InDim dims ) {return dims;}
|
||||
};
|
||||
|
||||
template<typename Dim> struct DimConstr<Dim, 0> {
|
||||
template<typename InDim>
|
||||
static inline Dim getDim(InDim dims ) {return Dim(dims.TotalSize());}
|
||||
};
|
||||
|
||||
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
|
||||
struct FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{
|
||||
typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;
|
||||
typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;
|
||||
const Dimensions m_dimensions;
|
||||
const Dimensions& dimensions() const { return m_dimensions; }
|
||||
FunctorExtractor(const TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)
|
||||
: m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}
|
||||
};
|
||||
|
||||
|
||||
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
|
||||
struct FunctorExtractor<TensorEvaluator<TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>
|
||||
: FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{};
|
||||
/// template deduction function for FunctorExtractor
|
||||
template <typename Evaluator>
|
||||
auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
|
||||
return FunctorExtractor<Evaluator>(evaluator);
|
||||
}
|
||||
} // namespace internal
|
||||
} // namespace TensorSycl
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
|
|
@ -0,0 +1,114 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
/*****************************************************************
|
||||
* TensorSyclLeafCount.h
|
||||
*
|
||||
* \brief:
|
||||
* The leaf count uses a pre-order expression tree traversal in order to
* count the number of leaf nodes in the expression.
|
||||
*
|
||||
*****************************************************************/
|
||||
|
||||
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
|
||||
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
|
||||
|
||||
namespace Eigen {
|
||||
namespace TensorSycl {
|
||||
namespace internal {
|
||||
/// \brief LeafCount is used to count terminal (leaf) nodes. The total number of
/// leaf nodes is used by MakePlaceHolderExprHelper to find the order
/// of a leaf node in an expression tree at compile time.
|
||||
template <typename Expr>
|
||||
struct LeafCount;
|
||||
|
||||
template<typename... Args> struct CategoryCount;
|
||||
|
||||
template<> struct CategoryCount<>
|
||||
{
|
||||
static const size_t Count =0;
|
||||
};
|
||||
|
||||
template<typename Arg, typename... Args>
|
||||
struct CategoryCount<Arg,Args...>{
|
||||
static const size_t Count = LeafCount<Arg>::Count + CategoryCount<Args...>::Count;
|
||||
};
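A compact standalone sketch of the same counting scheme with hypothetical Leaf/Node types: leaves count as one, interior nodes sum the counts of their children, and the total is available at compile time, exactly as LeafCount and CategoryCount do for the tensor expression nodes below.

#include <cstddef>

struct Leaf {};
template <class... Children> struct Node {};

template <class Expr> struct Count;

// Sum of counts over a parameter pack (same role as CategoryCount above).
template <class... Args> struct Sum { static const size_t value = 0; };
template <class Arg, class... Args> struct Sum<Arg, Args...> {
  static const size_t value = Count<Arg>::value + Sum<Args...>::value;
};

template <> struct Count<Leaf> { static const size_t value = 1; };
template <class... Children> struct Count<Node<Children...> > : Sum<Children...> {};

// A toy tree with four leaves, checked at compile time.
static_assert(Count<Node<Leaf, Node<Leaf, Leaf>, Leaf> >::value == 4,
              "four leaf nodes in the expression tree");
int main() { return 0; }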
|
||||
|
||||
/// specialisation of the \ref LeafCount struct when the node type is const TensorMap
|
||||
template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
|
||||
struct LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> > {
|
||||
static const size_t Count =1;
|
||||
};
|
||||
|
||||
/// specialisation of the \ref LeafCount struct when the node type is TensorMap
|
||||
template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
|
||||
struct LeafCount<TensorMap<PlainObjectType, Options_, MakePointer_> > :LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> >{};
|
||||
|
||||
// const TensorCwiseUnaryOp, const TensorCwiseNullaryOp, const TensorCwiseBinaryOp, const TensorCwiseTernaryOp, and const TensorBroadcastingOp
|
||||
template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
|
||||
struct LeafCount<const CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {};
|
||||
// TensorCwiseUnaryOp, TensorCwiseNullaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, and TensorBroadcastingOp
|
||||
template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
|
||||
struct LeafCount<CategoryExpr<OP, RHSExpr...> > :LeafCount<const CategoryExpr<OP, RHSExpr...> >{};
|
||||
|
||||
/// specialisation of the \ref LeafCount struct when the node type is const TensorSelectOp, which is an exception with three subexpressions
|
||||
template <typename IfExpr, typename ThenExpr, typename ElseExpr>
|
||||
struct LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {};
|
||||
/// specialisation of the \ref LeafCount struct when the node type is TensorSelectOp
|
||||
template <typename IfExpr, typename ThenExpr, typename ElseExpr>
|
||||
struct LeafCount<TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >: LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > {};
|
||||
|
||||
|
||||
/// specialisation of the \ref LeafCount struct when the node type is const TensorAssignOp
|
||||
template <typename LHSExpr, typename RHSExpr>
|
||||
struct LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {};
|
||||
|
||||
/// specialisation of the \ref LeafCount struct when the node type is
/// TensorAssignOp. It is an exception and is not the same as the unary case.
|
||||
template <typename LHSExpr, typename RHSExpr>
|
||||
struct LeafCount<TensorAssignOp<LHSExpr, RHSExpr> > :LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >{};
|
||||
|
||||
/// specialisation of the \ref LeafCount struct when the node type is const TensorForcedEvalOp
|
||||
template <typename Expr>
|
||||
struct LeafCount<const TensorForcedEvalOp<Expr> > {
|
||||
static const size_t Count =1;
|
||||
};
|
||||
|
||||
/// specialisation of the \ref LeafCount struct when the node type is TensorForcedEvalOp
|
||||
template <typename Expr>
|
||||
struct LeafCount<TensorForcedEvalOp<Expr> >: LeafCount<const TensorForcedEvalOp<Expr> > {};
|
||||
|
||||
/// specialisation of the \ref LeafCount struct when the node type is const TensorEvalToOp
|
||||
template <typename Expr>
|
||||
struct LeafCount<const TensorEvalToOp<Expr> > {
|
||||
static const size_t Count = 1 + CategoryCount<Expr>::Count;
|
||||
};
|
||||
|
||||
/// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp
|
||||
template <typename OP, typename Dim, typename Expr>
|
||||
struct LeafCount<const TensorReductionOp<OP, Dim, Expr> > {
|
||||
static const size_t Count =1;
|
||||
};
|
||||
|
||||
/// specialisation of the \ref LeafCount struct when the node type is TensorReductionOp
|
||||
template <typename OP, typename Dim, typename Expr>
|
||||
struct LeafCount<TensorReductionOp<OP, Dim, Expr> >: LeafCount<const TensorReductionOp<OP, Dim, Expr> >{};
|
||||
|
||||
/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
|
||||
template <typename Expr>
|
||||
struct LeafCount<TensorEvalToOp<Expr> >: LeafCount<const TensorEvalToOp<Expr> >{};
|
||||
|
||||
} // namespace internal
} // namespace TensorSycl
} // namespace Eigen
|
||||
|
||||
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
|
|
@ -0,0 +1,181 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
/*****************************************************************
|
||||
* TensorSyclPlaceHolderExpr.h
|
||||
*
|
||||
* \brief:
|
||||
* This is the specialisation of the placeholder expression based on the
|
||||
* operation type
|
||||
*
|
||||
*****************************************************************/
|
||||
|
||||
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
|
||||
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
|
||||
|
||||
namespace Eigen {
|
||||
namespace TensorSycl {
|
||||
namespace internal {
|
||||
|
||||
/// \struct PlaceHolder
|
||||
/// \brief PlaceHolder is used to replace the \ref TensorMap in the expression
|
||||
/// tree.
|
||||
/// PlaceHolder contains the order of the leaf node in the expression tree.
|
||||
template <typename Scalar, size_t N>
|
||||
struct PlaceHolder {
|
||||
static constexpr size_t I = N;
|
||||
typedef Scalar Type;
|
||||
};
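// Editorial note (not part of the upstream header): for an assignment
// expression such as A = B + C, each TensorMap leaf is rewritten as
// PlaceHolder<leaf-type, K>, where K is the leaf's position computed by the
// CalculateIndex helpers below.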
|
||||
|
||||
/// \struct PlaceHolderExpression
|
||||
/// \brief It is used to create the PlaceHolder expression. The PlaceHolder
/// expression is a copy of the expression type in which the TensorMap nodes
/// have been replaced with PlaceHolder.
|
||||
template <typename Expr, size_t N>
|
||||
struct PlaceHolderExpression;
|
||||
|
||||
template<size_t N, typename... Args>
|
||||
struct CalculateIndex;
|
||||
|
||||
template<size_t N, typename Arg>
|
||||
struct CalculateIndex<N, Arg>{
|
||||
typedef typename PlaceHolderExpression<Arg, N>::Type ArgType;
|
||||
typedef utility::tuple::Tuple<ArgType> ArgsTuple;
|
||||
};
|
||||
|
||||
template<size_t N, typename Arg1, typename Arg2>
|
||||
struct CalculateIndex<N, Arg1, Arg2>{
|
||||
static const size_t Arg2LeafCount = LeafCount<Arg2>::Count;
|
||||
typedef typename PlaceHolderExpression<Arg1, N - Arg2LeafCount>::Type Arg1Type;
|
||||
typedef typename PlaceHolderExpression<Arg2, N>::Type Arg2Type;
|
||||
typedef utility::tuple::Tuple<Arg1Type, Arg2Type> ArgsTuple;
|
||||
};
|
||||
|
||||
template<size_t N, typename Arg1, typename Arg2, typename Arg3>
|
||||
struct CalculateIndex<N, Arg1, Arg2, Arg3> {
|
||||
static const size_t Arg3LeafCount = LeafCount<Arg3>::Count;
|
||||
static const size_t Arg2LeafCount = LeafCount<Arg2>::Count;
|
||||
typedef typename PlaceHolderExpression<Arg1, N - Arg3LeafCount - Arg2LeafCount>::Type Arg1Type;
|
||||
typedef typename PlaceHolderExpression<Arg2, N - Arg3LeafCount>::Type Arg2Type;
|
||||
typedef typename PlaceHolderExpression<Arg3, N>::Type Arg3Type;
|
||||
typedef utility::tuple::Tuple<Arg1Type, Arg2Type, Arg3Type> ArgsTuple;
|
||||
};
|
||||
|
||||
template<template<class...> class Category , class OP, class TPL>
|
||||
struct CategoryHelper;
|
||||
|
||||
template<template<class...> class Category , class OP, class ...T >
|
||||
struct CategoryHelper<Category, OP, utility::tuple::Tuple<T...> > {
|
||||
typedef Category<OP, T... > Type;
|
||||
};
|
||||
|
||||
template<template<class...> class Category , class ...T >
|
||||
struct CategoryHelper<Category, NoOP, utility::tuple::Tuple<T...> > {
|
||||
typedef Category<T... > Type;
|
||||
};
|
||||
|
||||
/// specialisation of the \ref PlaceHolderExpression when the node is
|
||||
/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorBroadcastingOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp
|
||||
#define OPEXPRCATEGORY(CVQual)\
|
||||
template <template <class, class... > class Category, typename OP, typename... SubExpr, size_t N>\
|
||||
struct PlaceHolderExpression<CVQual Category<OP, SubExpr...>, N>{\
|
||||
typedef CVQual typename CategoryHelper<Category, OP, typename CalculateIndex<N, SubExpr...>::ArgsTuple>::Type Type;\
|
||||
};
|
||||
|
||||
OPEXPRCATEGORY(const)
|
||||
OPEXPRCATEGORY()
|
||||
#undef OPEXPRCATEGORY
|
||||
|
||||
/// specialisation of the \ref PlaceHolderExpression when the node is
|
||||
/// TensorCwiseSelectOp
|
||||
#define SELECTEXPR(CVQual)\
|
||||
template <typename IfExpr, typename ThenExpr, typename ElseExpr, size_t N>\
|
||||
struct PlaceHolderExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, N> {\
|
||||
typedef CVQual typename CategoryHelper<TensorSelectOp, NoOP, typename CalculateIndex<N, IfExpr, ThenExpr, ElseExpr>::ArgsTuple>::Type Type;\
|
||||
};
|
||||
|
||||
SELECTEXPR(const)
|
||||
SELECTEXPR()
|
||||
#undef SELECTEXPR
|
||||
|
||||
/// specialisation of the \ref PlaceHolderExpression when the node is
|
||||
/// TensorAssignOp
|
||||
#define ASSIGNEXPR(CVQual)\
|
||||
template <typename LHSExpr, typename RHSExpr, size_t N>\
|
||||
struct PlaceHolderExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr>, N> {\
|
||||
typedef CVQual typename CategoryHelper<TensorAssignOp, NoOP, typename CalculateIndex<N, LHSExpr, RHSExpr>::ArgsTuple>::Type Type;\
|
||||
};
|
||||
|
||||
ASSIGNEXPR(const)
|
||||
ASSIGNEXPR()
|
||||
#undef ASSIGNEXPR
|
||||
|
||||
/// specialisation of the \ref PlaceHolderExpression when the node is
|
||||
/// TensorMap
|
||||
#define TENSORMAPEXPR(CVQual)\
|
||||
template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_, size_t N>\
|
||||
struct PlaceHolderExpression< CVQual TensorMap< Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> {\
|
||||
typedef CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> Type;\
|
||||
};
|
||||
|
||||
TENSORMAPEXPR(const)
|
||||
TENSORMAPEXPR()
|
||||
#undef TENSORMAPEXPR
|
||||
|
||||
/// specialisation of the \ref PlaceHolderExpression when the node is
|
||||
/// TensorForcedEvalOp
|
||||
#define FORCEDEVAL(CVQual)\
|
||||
template <typename Expr, size_t N>\
|
||||
struct PlaceHolderExpression<CVQual TensorForcedEvalOp<Expr>, N> {\
|
||||
typedef CVQual PlaceHolder<CVQual TensorForcedEvalOp<Expr>, N> Type;\
|
||||
};
|
||||
|
||||
FORCEDEVAL(const)
|
||||
FORCEDEVAL()
|
||||
#undef FORCEDEVAL
|
||||
|
||||
/// specialisation of the \ref PlaceHolderExpression when the node is
|
||||
/// TensorEvalToOp
|
||||
#define EVALTO(CVQual)\
|
||||
template <typename Expr, size_t N>\
|
||||
struct PlaceHolderExpression<CVQual TensorEvalToOp<Expr>, N> {\
|
||||
typedef CVQual TensorEvalToOp<typename CalculateIndex <N, Expr>::ArgType> Type;\
|
||||
};
|
||||
|
||||
EVALTO(const)
|
||||
EVALTO()
|
||||
#undef EVALTO
|
||||
|
||||
|
||||
/// specialisation of the \ref PlaceHolderExpression when the node is
|
||||
/// TensorReductionOp
|
||||
#define SYCLREDUCTION(CVQual)\
|
||||
template <typename OP, typename Dims, typename Expr, size_t N>\
|
||||
struct PlaceHolderExpression<CVQual TensorReductionOp<OP, Dims, Expr>, N>{\
|
||||
typedef CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dims,Expr>, N> Type;\
|
||||
};
|
||||
SYCLREDUCTION(const)
|
||||
SYCLREDUCTION()
|
||||
#undef SYCLREDUCTION
|
||||
|
||||
/// template deduction for \ref PlaceHolderExpression struct
|
||||
template <typename Expr>
|
||||
struct createPlaceHolderExpression {
|
||||
static const size_t TotalLeaves = LeafCount<Expr>::Count;
|
||||
typedef typename PlaceHolderExpression<Expr, TotalLeaves - 1>::Type Type;
|
||||
};
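// Editorial example (not part of the upstream header): for A = B + C the total
// leaf count is 3, so the numbering starts at TotalLeaves - 1 == 2 and the
// CalculateIndex recursion assigns 2 to the right-most leaf (C), 1 to B and 0
// to the left-most leaf (A).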
|
||||
|
||||
} // internal
|
||||
} // TensorSycl
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
|
|
@ -0,0 +1,70 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Cummins Chris PhD student at The University of Edinburgh.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
/*****************************************************************
|
||||
* TensorSyclRun.h
|
||||
*
|
||||
* \brief:
|
||||
* Schedule_kernel invokes a specialised version of the kernel struct. The
* specialisation is based on the data dimension in the SYCL buffer
|
||||
*
|
||||
*****************************************************************/
|
||||
|
||||
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
|
||||
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
|
||||
|
||||
namespace Eigen {
|
||||
namespace TensorSycl {
|
||||
/// The run function in TensorSycl converts the expression tree to a
/// buffer-based expression tree;
/// creates the expression tree for the device with accessors to buffers;
/// and constructs the kernel and submits it to the SYCL queue.
|
||||
template <typename Expr, typename Dev>
|
||||
void run(Expr &expr, Dev &dev) {
|
||||
Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||
if (needs_assign) {
|
||||
typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
|
||||
auto functors = internal::extractFunctors(evaluator);
|
||||
|
||||
size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>()/2;
|
||||
dev.m_queue.submit([&](cl::sycl::handler &cgh) {
|
||||
|
||||
// create a tuple of accessors from Evaluator
|
||||
auto tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator);
|
||||
const auto range = utility::tuple::get<0>(tuple_of_accessors).get_range()[0];
|
||||
size_t GRange=range;
|
||||
if (tileSize>GRange) tileSize=GRange;
|
||||
else if(GRange>tileSize){
|
||||
size_t xMode = GRange % tileSize;
|
||||
if (xMode != 0) GRange += (tileSize - xMode);
|
||||
}
|
||||
// run the kernel
|
||||
cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
|
||||
typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
|
||||
auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||
auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
|
||||
if (itemID.get_global_linear_id() < range) {
|
||||
device_evaluator.evalScalar(static_cast<int>(itemID.get_global_linear_id()));
|
||||
}
|
||||
});
|
||||
});
|
||||
dev.m_queue.throw_asynchronous();
|
||||
}
|
||||
|
||||
evaluator.cleanup();
|
||||
}
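// Editorial note (not part of the upstream header): conceptually, for an
// assignment expression expr and a SYCL device wrapper dev exposing m_queue as
// used above, run(expr, dev) extracts the functors and buffer accessors on the
// host, launches an nd_range kernel whose global range is the output size
// rounded up to a multiple of the work-group size, and lets each work-item
// evaluate one output coefficient through evalScalar().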
|
||||
} // namespace TensorSycl
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
|
|
@ -0,0 +1,237 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
/*****************************************************************
|
||||
* TensorSyclTuple.h
|
||||
*
|
||||
* \brief:
|
||||
* Minimal implementation of std::tuple that can be used inside a SYCL kernel.
|
||||
*
|
||||
*****************************************************************/
|
||||
|
||||
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
|
||||
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
|
||||
namespace utility {
|
||||
namespace tuple {
|
||||
/// \struct StaticIf
|
||||
/// \brief The StaticIf struct is used to statically choose the type based on the
|
||||
/// condition.
|
||||
template <bool, typename T = void> struct StaticIf;
|
||||
/// \brief specialisation of the \ref StaticIf when the condition is true
|
||||
template <typename T>
|
||||
struct StaticIf<true, T> {
|
||||
typedef T type;
|
||||
};
|
||||
|
||||
/// \struct Tuple
|
||||
/// \brief is a fixed-size collection of heterogeneous values
|
||||
/// \tparam Ts... - the types of the elements that the tuple stores.
|
||||
/// Empty list is supported.
|
||||
template <class... Ts>
|
||||
struct Tuple {};
|
||||
|
||||
/// \brief specialisation of the \ref Tuple class when the tuple has at least
|
||||
/// one element.
|
||||
/// \tparam T : the type of the first element in the tuple.
|
||||
/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty.
|
||||
template <class T, class... Ts>
|
||||
struct Tuple<T, Ts...> {
|
||||
Tuple(T t, Ts... ts) : head(t), tail(ts...) {}
|
||||
T head;
|
||||
Tuple<Ts...> tail;
|
||||
};
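// Editorial example (not part of the upstream header): Tuple<int, float, char>
// is laid out recursively as { int head; Tuple<float, char> tail; }, so the
// head of each nested tail holds the next element of the tuple.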
|
||||
|
||||
/// \struct ElemTypeHolder
|
||||
/// \brief ElemTypeHolder class is used to specify the types of the
|
||||
/// elements inside the tuple
|
||||
/// \tparam size_t the index of the requested element inside the tuple
|
||||
/// \tparam class the tuple class
|
||||
template <size_t, class>
|
||||
struct ElemTypeHolder;
|
||||
|
||||
/// \brief specialisation of the \ref ElemTypeHolder class when the requested
/// index is 0; it yields the type of the first element in the tuple
|
||||
template <class T, class... Ts>
|
||||
struct ElemTypeHolder<0, Tuple<T, Ts...> > {
|
||||
typedef T type;
|
||||
};
|
||||
|
||||
/// \brief specialisation of the \ref ElemTypeHolder class when the requested
/// index is bigger than 0. It recursively calls itself to
/// detect the type of the requested element in the tuple
/// \tparam T : the type of the first element in the tuple.
/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty.
/// \tparam k is the index of the requested element in the tuple
|
||||
template <size_t k, class T, class... Ts>
|
||||
struct ElemTypeHolder<k, Tuple<T, Ts...> > {
|
||||
typedef typename ElemTypeHolder<k - 1, Tuple<Ts...> >::type type;
|
||||
};
|
||||
|
||||
/// get
|
||||
/// \brief Extracts the first element from the tuple.
|
||||
/// K=0 represents the first element of the tuple. The tuple cannot be empty.
|
||||
/// \tparam Ts... are the type of the elements in the tuple.
|
||||
/// \param t is the tuple whose contents to extract
|
||||
/// \return typename ElemTypeHolder<0, Tuple<Ts...> >::type &>::type
|
||||
|
||||
#define TERMINATE_CONDS_TUPLE_GET(CVQual) \
|
||||
template <size_t k, class... Ts> \
|
||||
typename StaticIf<k == 0, CVQual typename ElemTypeHolder<0, Tuple<Ts...> >::type &>::type \
|
||||
get(CVQual Tuple<Ts...> &t) { \
|
||||
static_assert(sizeof...(Ts)!=0, "The requested value is bigger than the size of the tuple"); \
|
||||
return t.head; \
|
||||
}
|
||||
|
||||
TERMINATE_CONDS_TUPLE_GET(const)
|
||||
TERMINATE_CONDS_TUPLE_GET()
|
||||
#undef TERMINATE_CONDS_TUPLE_GET
|
||||
/// get
|
||||
/// \brief Extracts the Kth element from the tuple.
|
||||
///\tparam K is an integer value in [0,sizeof...(Types)).
|
||||
/// \tparam T is the (sizeof...(Types) -(K+1)) element in the tuple
|
||||
/// \tparam Ts... are the type of the elements in the tuple.
|
||||
/// \param t is the tuple whose contents to extract
|
||||
/// \return typename ElemTypeHolder<K, Tuple<Ts...> >::type &>::type
|
||||
#define RECURSIVE_TUPLE_GET(CVQual) \
|
||||
template <size_t k, class T, class... Ts> \
|
||||
typename StaticIf<k != 0, CVQual typename ElemTypeHolder<k, Tuple<T, Ts...> >::type &>::type \
|
||||
get(CVQual Tuple<T, Ts...> &t) { \
|
||||
return utility::tuple::get<k - 1>(t.tail); \
|
||||
}
|
||||
RECURSIVE_TUPLE_GET(const)
|
||||
RECURSIVE_TUPLE_GET()
|
||||
#undef RECURSIVE_TUPLE_GET
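// Editorial example (not part of the upstream header): given
//   Tuple<int, float> t(1, 2.5f);
// utility::tuple::get<0>(t) returns t.head (the int 1) via the k == 0 overload,
// while get<1>(t) recurses once into t.tail and returns 2.5f.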
|
||||
|
||||
/// make_tuple
|
||||
/// \brief Creates a tuple object, deducing the target type from the types of
|
||||
/// arguments.
|
||||
/// \tparam Args the type of the arguments to construct the tuple from
|
||||
/// \param args zero or more arguments to construct the tuple from
|
||||
/// \return Tuple<Args...>
|
||||
template <typename... Args>
|
||||
Tuple<Args...> make_tuple(Args... args) {
|
||||
return Tuple<Args...>(args...);
|
||||
}
|
||||
|
||||
/// size
|
||||
/// \brief Provides access to the number of elements in a tuple as a
|
||||
/// compile-time constant expression.
|
||||
/// \tparam Args the type of the arguments to construct the tuple from
|
||||
/// \return size_t
|
||||
template <typename... Args>
|
||||
static constexpr size_t size(Tuple<Args...> &) {
|
||||
return sizeof...(Args);
|
||||
}
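// Editorial example (not part of the upstream header):
//   auto t = utility::tuple::make_tuple(1, 2.5f, 'c');
// deduces Tuple<int, float, char>, and size(t) is a constant expression equal
// to 3.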
|
||||
|
||||
/// \struct IndexList
|
||||
/// \brief Creates a list of indices from the elements in the tuple
/// \tparam Is... a list of indices from [0 to sizeof...(tuple elements))
|
||||
template <size_t... Is>
|
||||
struct IndexList {};
|
||||
|
||||
/// \struct RangeBuilder
|
||||
/// \brief Collects internal details for generating index ranges [MIN, MAX)
|
||||
/// Declare primary template for index range builder
|
||||
/// \tparam MIN is the starting index in the tuple
|
||||
/// \tparam N represents sizeof...(elements) - sizeof...(Is)
|
||||
/// \tparam Is... are the list of generated index so far
|
||||
template <size_t MIN, size_t N, size_t... Is>
|
||||
struct RangeBuilder;
|
||||
|
||||
// FIXME Doxygen has problems with recursive inheritance
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/// \brief base Step: Specialisation of the \ref RangeBuilder when the
|
||||
/// MIN==MAX. In this case the Is... is [0 to sizeof...(tuple elements))
|
||||
/// \tparam MIN is the starting index of the tuple
|
||||
/// \tparam Is is [0 to sizeof...(tuple elements))
|
||||
template <size_t MIN, size_t... Is>
|
||||
struct RangeBuilder<MIN, MIN, Is...> {
|
||||
typedef IndexList<Is...> type;
|
||||
};
|
||||
|
||||
/// Induction step: Specialisation of the RangeBuilder class when N!=MIN.
/// In this case we recursively subtract one from N and add one
/// index to the Is... list until N==MIN
|
||||
/// \tparam MIN is the starting index in the tuple
|
||||
/// \tparam N represents sizeof...(elements) - sizeof...(Is)
|
||||
/// \tparam Is... are the list of generated index so far
|
||||
template <size_t MIN, size_t N, size_t... Is>
|
||||
struct RangeBuilder : public RangeBuilder<MIN, N - 1, N - 1, Is...> {};
|
||||
#endif // EIGEN_PARSED_BY_DOXYGEN
|
||||
|
||||
/// \brief IndexRange that returns a [MIN, MAX) index range
|
||||
/// \tparam MIN is the starting index in the tuple
|
||||
/// \tparam MAX is the size of the tuple
|
||||
template <size_t MIN, size_t MAX>
|
||||
struct IndexRange: RangeBuilder<MIN, MAX>::type {};
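// Editorial example (not part of the upstream header): IndexRange<0, 3>
// expands as RangeBuilder<0, 3> -> RangeBuilder<0, 2, 2> ->
// RangeBuilder<0, 1, 1, 2> -> RangeBuilder<0, 0, 0, 1, 2>, whose nested type
// is IndexList<0, 1, 2>.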
|
||||
|
||||
/// append_base
|
||||
/// \brief unpacking the elements of the input tuple t and creating a new tuple
|
||||
/// by adding element a at the end of it.
|
||||
///\tparam Args... the type of the elements inside the tuple t
|
||||
/// \tparam T the type of the new element going to be added at the end of tuple
|
||||
/// \tparam I... is the list of index from [0 to sizeof...(t))
|
||||
/// \param t the tuple on which we want to append a.
|
||||
/// \param a the new element to be added to the tuple
|
||||
/// \return Tuple<Args..., T>
|
||||
template <typename... Args, typename T, size_t... I>
|
||||
Tuple<Args..., T> append_base(Tuple<Args...> t, T a,IndexList<I...>) {
|
||||
return utility::tuple::make_tuple(get<I>(t)..., a);
|
||||
}
|
||||
|
||||
/// append
|
||||
/// \brief the deduction function for \ref append_base that automatically
/// generates the \ref IndexRange
|
||||
///\tparam Args... the type of the elements inside the tuple t
|
||||
/// \tparam T the type of the new element going to be added at the end of tuple
|
||||
/// \param t the tuple on which we want to append a.
|
||||
/// \param a the new element to be added to the tuple
|
||||
/// \return Tuple<Args..., T>
|
||||
template <typename... Args, typename T>
|
||||
Tuple<Args..., T> append(Tuple<Args...> t, T a) {
|
||||
return utility::tuple::append_base(t, a, IndexRange<0, sizeof...(Args)>());
|
||||
}
|
||||
|
||||
/// append_base
|
||||
/// \brief This is a specialisation of \ref append_base when we want to
/// concatenate tuple t2 at the end of the tuple t1. Here we unpack both
/// tuples, generate the IndexRange for each of them and create an output
/// tuple that contains the elements of both t1 and t2.
|
||||
///\tparam Args1... the type of the elements inside the tuple t1
|
||||
///\tparam Args2... the type of the elements inside the tuple t2
|
||||
/// \tparam I1... is the list of index from [0 to sizeof...(t1))
|
||||
/// \tparam I2... is the list of index from [0 to sizeof...(t2))
|
||||
/// \param t1 is the tuple on which we want to append t2.
|
||||
/// \param t2 is the tuple that is going to be added on t1.
|
||||
/// \return Tuple<Args1..., Args2...>
|
||||
template <typename... Args1, typename... Args2, size_t... I1, size_t... I2>
|
||||
Tuple<Args1..., Args2...> append_base(Tuple<Args1...> t1, Tuple<Args2...> t2, IndexList<I1...>, IndexList<I2...>) {
|
||||
return utility::tuple::make_tuple(get<I1>(t1)...,get<I2>(t2)...);
|
||||
}
|
||||
|
||||
/// append
|
||||
/// \brief deduction function for \ref append_base when we are appending tuple
/// t2 to tuple t1. In this case the \ref IndexRange for both tuples is
/// automatically generated.
|
||||
///\tparam Args1... the type of the elements inside the tuple t1
|
||||
///\tparam Args2... the type of the elements inside the tuple t2
|
||||
/// \param t1 is the tuple on which we want to append t2.
|
||||
/// \param t2 is the tuple that is going to be added on t1.
|
||||
/// \return Tuple<Args1..., Args2...>
|
||||
template <typename... Args1, typename... Args2>
|
||||
Tuple<Args1..., Args2...> append(Tuple<Args1...> t1,Tuple<Args2...> t2) {
|
||||
return utility::tuple::append_base(t1, t2, IndexRange<0, sizeof...(Args1)>(), IndexRange<0, sizeof...(Args2)>());
|
||||
}
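// Editorial example (not part of the upstream header): appending an element,
//   append(make_tuple(1, 2), 3.0)   yields a Tuple<int, int, double>,
// while concatenating two tuples,
//   append(make_tuple(1), make_tuple(2.5f, 'c'))   yields a Tuple<int, float, char>.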
|
||||
} // tuple
|
||||
} // utility
|
||||
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
|
|
@ -0,0 +1,272 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
|
||||
template<typename Scalar, int Options>
|
||||
class compute_tensor_flags
|
||||
{
|
||||
enum {
|
||||
is_dynamic_size_storage = 1,
|
||||
|
||||
is_aligned =
|
||||
(
|
||||
((Options&DontAlign)==0) && (
|
||||
#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
|
||||
(!is_dynamic_size_storage)
|
||||
#else
|
||||
0
|
||||
#endif
|
||||
|
|
||||
#if EIGEN_MAX_ALIGN_BYTES>0
|
||||
is_dynamic_size_storage
|
||||
#else
|
||||
0
|
||||
#endif
|
||||
)
|
||||
),
|
||||
packet_access_bit = packet_traits<Scalar>::Vectorizable && is_aligned ? PacketAccessBit : 0
|
||||
};
|
||||
|
||||
public:
|
||||
enum { ret = packet_access_bit };
|
||||
};
|
||||
|
||||
|
||||
template<typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
|
||||
struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
|
||||
{
|
||||
typedef Scalar_ Scalar;
|
||||
typedef Dense StorageKind;
|
||||
typedef IndexType_ Index;
|
||||
static const int NumDimensions = NumIndices_;
|
||||
static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
|
||||
enum {
|
||||
Options = Options_,
|
||||
Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit)
|
||||
};
|
||||
template <typename T> struct MakePointer {
|
||||
typedef T* Type;
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template<typename Scalar_, typename Dimensions, int Options_, typename IndexType_>
|
||||
struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
|
||||
{
|
||||
typedef Scalar_ Scalar;
|
||||
typedef Dense StorageKind;
|
||||
typedef IndexType_ Index;
|
||||
static const int NumDimensions = array_size<Dimensions>::value;
|
||||
static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
|
||||
enum {
|
||||
Options = Options_,
|
||||
Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0: LvalueBit)
|
||||
};
|
||||
template <typename T> struct MakePointer {
|
||||
typedef T* Type;
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template<typename PlainObjectType, int Options_, template <class> class MakePointer_>
|
||||
struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
|
||||
: public traits<PlainObjectType>
|
||||
{
|
||||
typedef traits<PlainObjectType> BaseTraits;
|
||||
typedef typename BaseTraits::Scalar Scalar;
|
||||
typedef typename BaseTraits::StorageKind StorageKind;
|
||||
typedef typename BaseTraits::Index Index;
|
||||
static const int NumDimensions = BaseTraits::NumDimensions;
|
||||
static const int Layout = BaseTraits::Layout;
|
||||
enum {
|
||||
Options = Options_,
|
||||
Flags = BaseTraits::Flags
|
||||
};
|
||||
template <class T> struct MakePointer {
|
||||
// Intermediate typedef to workaround MSVC issue.
|
||||
typedef MakePointer_<T> MakePointerT;
|
||||
typedef typename MakePointerT::Type Type;
|
||||
};
|
||||
};
|
||||
|
||||
template<typename PlainObjectType>
|
||||
struct traits<TensorRef<PlainObjectType> >
|
||||
: public traits<PlainObjectType>
|
||||
{
|
||||
typedef traits<PlainObjectType> BaseTraits;
|
||||
typedef typename BaseTraits::Scalar Scalar;
|
||||
typedef typename BaseTraits::StorageKind StorageKind;
|
||||
typedef typename BaseTraits::Index Index;
|
||||
static const int NumDimensions = BaseTraits::NumDimensions;
|
||||
static const int Layout = BaseTraits::Layout;
|
||||
enum {
|
||||
Options = BaseTraits::Options,
|
||||
Flags = BaseTraits::Flags
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
|
||||
struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
|
||||
{
|
||||
typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
|
||||
};
|
||||
|
||||
template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
|
||||
struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
|
||||
{
|
||||
typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
|
||||
};
|
||||
|
||||
template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
|
||||
struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
|
||||
};
|
||||
|
||||
template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
|
||||
struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
|
||||
};
|
||||
|
||||
template<typename PlainObjectType, int Options, template <class> class MakePointer>
|
||||
struct eval<TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
|
||||
};
|
||||
|
||||
template<typename PlainObjectType, int Options, template <class> class MakePointer>
|
||||
struct eval<const TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
|
||||
};
|
||||
|
||||
template<typename PlainObjectType>
|
||||
struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorRef<PlainObjectType>& type;
|
||||
};
|
||||
|
||||
template<typename PlainObjectType>
|
||||
struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorRef<PlainObjectType>& type;
|
||||
};
|
||||
|
||||
// TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector.
|
||||
template<typename T, int n=1, typename PlainObject = void> struct nested
|
||||
{
|
||||
typedef typename ref_selector<T>::type type;
|
||||
};
|
||||
|
||||
template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
|
||||
struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
|
||||
{
|
||||
typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
|
||||
};
|
||||
|
||||
template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
|
||||
struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
|
||||
{
|
||||
typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
|
||||
};
|
||||
|
||||
template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
|
||||
struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
|
||||
{
|
||||
typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
|
||||
};
|
||||
|
||||
template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
|
||||
struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
|
||||
{
|
||||
typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
|
||||
};
|
||||
|
||||
|
||||
template <typename PlainObjectType, int Options, template <class> class MakePointer>
|
||||
struct nested<TensorMap<PlainObjectType, Options, MakePointer> >
|
||||
{
|
||||
typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
|
||||
};
|
||||
|
||||
template <typename PlainObjectType, int Options, template <class> class MakePointer>
|
||||
struct nested<const TensorMap<PlainObjectType, Options, MakePointer> >
|
||||
{
|
||||
typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
|
||||
};
|
||||
|
||||
template <typename PlainObjectType>
|
||||
struct nested<TensorRef<PlainObjectType> >
|
||||
{
|
||||
typedef const TensorRef<PlainObjectType>& type;
|
||||
};
|
||||
|
||||
template <typename PlainObjectType>
|
||||
struct nested<const TensorRef<PlainObjectType> >
|
||||
{
|
||||
typedef const TensorRef<PlainObjectType>& type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C,
|
||||
// R, B), and convolve it with a set of filters, which can also be presented as
|
||||
// a tensor (D, K, K, M), where M is the number of filters, K is the filter
|
||||
// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For
|
||||
// simplicity we assume that we always use square filters (which is usually the
|
||||
// case in images), hence the two Ks in the tensor dimension. The op also takes in
|
||||
// a few additional parameters:
|
||||
// Stride (S): The convolution stride is the offset between locations where we
|
||||
// apply the filters. A larger stride means that the output will be
|
||||
// spatially smaller.
|
||||
// Padding (P): The padding we apply to the input tensor along the R and C
|
||||
// dimensions. This is usually used to make sure that the spatial
|
||||
// dimensions of the output matches our intention.
|
||||
//
|
||||
// Two types of padding are often used:
|
||||
// SAME: The pad value is computed so that the output will have size
|
||||
// R/S and C/S.
|
||||
// VALID: no padding is carried out.
|
||||
// When we do padding, the padded values at the padded locations are usually
|
||||
// zero.
|
||||
//
|
||||
// The output dimensions for convolution, when given all the parameters above,
|
||||
// are as follows:
|
||||
// When Padding = SAME: the output size is (B, R', C', M), where
|
||||
// R' = ceil(float(R) / float(S))
|
||||
// C' = ceil(float(C) / float(S))
|
||||
// where ceil is the ceiling function. The input tensor is padded with 0 as
|
||||
// needed. The number of padded rows and columns are computed as:
|
||||
// Pr = ((R' - 1) * S + K - R) / 2
|
||||
// Pc = ((C' - 1) * S + K - C) / 2
|
||||
// when the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2.
|
||||
// This is where SAME comes from - the output has the same size as the input.
|
||||
// When Padding = VALID: the output size is computed as
|
||||
// R' = ceil(float(R - K + 1) / float(S))
|
||||
// C' = ceil(float(C - K + 1) / float(S))
|
||||
// and the number of padded rows and columns are computed in the same way as in
|
||||
// the SAME case.
|
||||
// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0,
|
||||
// Pc=0.
|
||||
typedef enum {
|
||||
PADDING_VALID = 1,
|
||||
PADDING_SAME = 2
|
||||
} PaddingType;
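// Editorial worked example (not part of the upstream header): with R = C = 5,
// K = 3 and stride S = 2,
//   SAME:  R' = C' = ceil(5/2) = 3 and Pr = Pc = ((3-1)*2 + 3 - 5)/2 = 1,
//          i.e. one row/column of zeros is added on each side;
//   VALID: R' = C' = ceil((5 - 3 + 1)/2) = 2 and no padding is applied.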
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
|
|
@ -0,0 +1,248 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
|
||||
template <uint64_t n>
|
||||
struct static_val {
|
||||
static const uint64_t value = n;
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { }
|
||||
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
|
||||
eigen_assert(v == n);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename HIGH = uint64_t, typename LOW = uint64_t>
|
||||
struct TensorUInt128
|
||||
{
|
||||
HIGH high;
|
||||
LOW low;
|
||||
|
||||
template<typename OTHER_HIGH, typename OTHER_LOW>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
TensorUInt128(const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) : high(other.high), low(other.low) {
|
||||
EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
}
|
||||
|
||||
template<typename OTHER_HIGH, typename OTHER_LOW>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
TensorUInt128& operator = (const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) {
|
||||
EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
high = other.high;
|
||||
low = other.low;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
explicit TensorUInt128(const T& x) : high(0), low(x) {
|
||||
eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest()));
|
||||
eigen_assert(x >= 0);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
TensorUInt128(HIGH y, LOW x) : high(y), low(x) { }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const {
|
||||
return low;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const {
|
||||
return low;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const {
|
||||
return high;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
return (lhs.high == rhs.high) & (lhs.low == rhs.low);
|
||||
}
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
return (lhs.high != rhs.high) | (lhs.low != rhs.low);
|
||||
}
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
if (lhs.high != rhs.high) {
|
||||
return lhs.high > rhs.high;
|
||||
}
|
||||
return lhs.low >= rhs.low;
|
||||
}
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
if (lhs.high != rhs.high) {
|
||||
return lhs.high < rhs.high;
|
||||
}
|
||||
return lhs.low < rhs.low;
|
||||
}
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low);
|
||||
if (result.low < rhs.low) {
|
||||
result.high += 1;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low);
|
||||
if (result.low > lhs.low) {
|
||||
result.high -= 1;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
// Split each 128-bit integer into 4 32-bit integers, and then do the
|
||||
// multiplications by hand as follows:
|
||||
// lhs a b c d
|
||||
// rhs e f g h
|
||||
// -----------
|
||||
// ah bh ch dh
|
||||
// bg cg dg
|
||||
// cf df
|
||||
// de
|
||||
// The result is stored in two 64-bit integers, high and low.
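// Editorial check (not part of the upstream header): with 32-bit limbs the
// largest partial product, e.g. d * h with d = h = 0xFFFFFFFF, equals
// 0xFFFFFFFE00000001 and still fits in a uint64_t, so only the carries that
// are tracked explicitly below can occur between accumulation steps.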
|
||||
|
||||
const uint64_t LOW = 0x00000000FFFFFFFFLL;
|
||||
const uint64_t HIGH = 0xFFFFFFFF00000000LL;
|
||||
|
||||
uint64_t d = lhs.low & LOW;
|
||||
uint64_t c = (lhs.low & HIGH) >> 32LL;
|
||||
uint64_t b = lhs.high & LOW;
|
||||
uint64_t a = (lhs.high & HIGH) >> 32LL;
|
||||
|
||||
uint64_t h = rhs.low & LOW;
|
||||
uint64_t g = (rhs.low & HIGH) >> 32LL;
|
||||
uint64_t f = rhs.high & LOW;
|
||||
uint64_t e = (rhs.high & HIGH) >> 32LL;
|
||||
|
||||
// Compute the low 32 bits of low
|
||||
uint64_t acc = d * h;
|
||||
uint64_t low = acc & LOW;
|
||||
// Compute the high 32 bits of low. Add a carry every time we wrap around
|
||||
acc >>= 32LL;
|
||||
uint64_t carry = 0;
|
||||
uint64_t acc2 = acc + c * h;
|
||||
if (acc2 < acc) {
|
||||
carry++;
|
||||
}
|
||||
acc = acc2 + d * g;
|
||||
if (acc < acc2) {
|
||||
carry++;
|
||||
}
|
||||
low |= (acc << 32LL);
|
||||
|
||||
// Carry forward the high bits of acc to initiate the computation of the
|
||||
// low 32 bits of high
|
||||
acc2 = (acc >> 32LL) | (carry << 32LL);
|
||||
carry = 0;
|
||||
|
||||
acc = acc2 + b * h;
|
||||
if (acc < acc2) {
|
||||
carry++;
|
||||
}
|
||||
acc2 = acc + c * g;
|
||||
if (acc2 < acc) {
|
||||
carry++;
|
||||
}
|
||||
acc = acc2 + d * f;
|
||||
if (acc < acc2) {
|
||||
carry++;
|
||||
}
|
||||
uint64_t high = acc & LOW;
|
||||
|
||||
// Start to compute the high 32 bits of high.
|
||||
acc2 = (acc >> 32LL) | (carry << 32LL);
|
||||
|
||||
acc = acc2 + a * h;
|
||||
acc2 = acc + b * g;
|
||||
acc = acc2 + c * f;
|
||||
acc2 = acc + d * e;
|
||||
high |= (acc2 << 32LL);
|
||||
|
||||
return TensorUInt128<uint64_t, uint64_t>(high, low);
|
||||
}
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) {
|
||||
return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
|
||||
} else if (lhs < rhs) {
|
||||
return TensorUInt128<uint64_t, uint64_t>(0);
|
||||
} else {
|
||||
// calculate the biggest power of 2 times rhs that's less than or equal to lhs
|
||||
TensorUInt128<uint64_t, uint64_t> power2(1);
|
||||
TensorUInt128<uint64_t, uint64_t> d(rhs);
|
||||
TensorUInt128<uint64_t, uint64_t> tmp(lhs - d);
|
||||
while (lhs >= d) {
|
||||
tmp = tmp - d;
|
||||
d = d + d;
|
||||
power2 = power2 + power2;
|
||||
}
|
||||
|
||||
tmp = TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
|
||||
TensorUInt128<uint64_t, uint64_t> result(0);
|
||||
while (power2 != TensorUInt128<static_val<0>, static_val<0> >(0)) {
|
||||
if (tmp >= d) {
|
||||
tmp = tmp - d;
|
||||
result = result + power2;
|
||||
}
|
||||
// Shift right
|
||||
power2 = TensorUInt128<uint64_t, uint64_t>(power2.high >> 1, (power2.low >> 1) | (power2.high << 63));
|
||||
d = TensorUInt128<uint64_t, uint64_t>(d.high >> 1, (d.low >> 1) | (d.high << 63));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
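// Editorial example (not part of the upstream header): dividing 100 by 7 with
// this shift-and-subtract scheme first doubles d = 7 and power2 = 1 until d
// exceeds the dividend (d = 112, power2 = 16), then walks back down halving
// both while accumulating 8 + 4 + 2 = 14 into the result, leaving the
// remainder 2 in tmp.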
|
||||
|
||||
|
||||
} // namespace internal
|
||||
} // namespace Eigen
|
||||
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
|
|
@ -0,0 +1,608 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorVolumePatch
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Patch extraction specialized for processing of volumetric data.
|
||||
* This assumes that the input has at least 4 dimensions ordered as follows:
|
||||
* - channels
|
||||
* - planes
|
||||
* - rows
|
||||
* - columns
|
||||
* - (optional) additional dimensions such as time or batch size.
|
||||
* Calling the volume patch code with patch_planes, patch_rows, and patch_cols
|
||||
* is equivalent to calling the regular patch extraction code with parameters
|
||||
* d, patch_planes, patch_rows, patch_cols, and 1 for all the additional
|
||||
* dimensions.
|
||||
*/
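// Editorial note (not part of the upstream header): this op is normally built
// through a patch-extraction method on TensorBase (extract_volume_patches() in
// current Eigen; name assumed here, it is not defined in this file) rather
// than instantiated directly, and it adds one patch-index dimension to the
// output.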
|
||||
namespace internal {
|
||||
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
|
||||
struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType>
|
||||
{
|
||||
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
|
||||
typedef traits<XprType> XprTraits;
|
||||
typedef typename XprTraits::StorageKind StorageKind;
|
||||
typedef typename XprTraits::Index Index;
|
||||
typedef typename XprType::Nested Nested;
|
||||
typedef typename remove_reference<Nested>::type _Nested;
|
||||
static const int NumDimensions = XprTraits::NumDimensions + 1;
|
||||
static const int Layout = XprTraits::Layout;
|
||||
};
|
||||
|
||||
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
|
||||
struct eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorVolumePatchOp<Planes, Rows, Cols, XprType>& type;
|
||||
};
|
||||
|
||||
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
|
||||
struct nested<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, 1, typename eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType> >::type>
|
||||
{
|
||||
typedef TensorVolumePatchOp<Planes, Rows, Cols, XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
|
||||
class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Scalar Scalar;
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename Eigen::internal::nested<TensorVolumePatchOp>::type Nested;
|
||||
typedef typename Eigen::internal::traits<TensorVolumePatchOp>::StorageKind StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
|
||||
DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
|
||||
DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
|
||||
DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
|
||||
PaddingType padding_type, Scalar padding_value)
|
||||
: m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
|
||||
m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
|
||||
m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
|
||||
m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
|
||||
m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
|
||||
m_padding_type(padding_type), m_padding_value(padding_value) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
|
||||
DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
|
||||
DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
|
||||
DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
|
||||
DenseIndex padding_top_z, DenseIndex padding_bottom_z,
|
||||
DenseIndex padding_top, DenseIndex padding_bottom,
|
||||
DenseIndex padding_left, DenseIndex padding_right,
|
||||
Scalar padding_value)
|
||||
: m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
|
||||
m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
|
||||
m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
|
||||
m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
|
||||
m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
|
||||
m_padding_left(padding_left), m_padding_right(padding_right),
|
||||
m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex patch_planes() const { return m_patch_planes; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex patch_rows() const { return m_patch_rows; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex patch_cols() const { return m_patch_cols; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex plane_strides() const { return m_plane_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex row_strides() const { return m_row_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex col_strides() const { return m_col_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex in_plane_strides() const { return m_in_plane_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex in_row_strides() const { return m_in_row_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex in_col_strides() const { return m_in_col_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex row_inflate_strides() const { return m_row_inflate_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex col_inflate_strides() const { return m_col_inflate_strides; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
bool padding_explicit() const { return m_padding_explicit; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex padding_top_z() const { return m_padding_top_z; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex padding_bottom_z() const { return m_padding_bottom_z; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex padding_top() const { return m_padding_top; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex padding_bottom() const { return m_padding_bottom; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex padding_left() const { return m_padding_left; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseIndex padding_right() const { return m_padding_right; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
PaddingType padding_type() const { return m_padding_type; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
Scalar padding_value() const { return m_padding_value; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const DenseIndex m_patch_planes;
|
||||
const DenseIndex m_patch_rows;
|
||||
const DenseIndex m_patch_cols;
|
||||
const DenseIndex m_plane_strides;
|
||||
const DenseIndex m_row_strides;
|
||||
const DenseIndex m_col_strides;
|
||||
const DenseIndex m_in_plane_strides;
|
||||
const DenseIndex m_in_row_strides;
|
||||
const DenseIndex m_in_col_strides;
|
||||
const DenseIndex m_plane_inflate_strides;
|
||||
const DenseIndex m_row_inflate_strides;
|
||||
const DenseIndex m_col_inflate_strides;
|
||||
const bool m_padding_explicit;
|
||||
const DenseIndex m_padding_top_z;
|
||||
const DenseIndex m_padding_bottom_z;
|
||||
const DenseIndex m_padding_top;
|
||||
const DenseIndex m_padding_bottom;
|
||||
const DenseIndex m_padding_left;
|
||||
const DenseIndex m_padding_right;
|
||||
const PaddingType m_padding_type;
|
||||
const Scalar m_padding_value;
|
||||
};
|
||||
|
||||
|
||||
// Eval as rvalue
|
||||
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device>
|
||||
{
|
||||
typedef TensorVolumePatchOp<Planes, Rows, Cols, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
static const int NumDims = NumInputDims + 1;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
|
||||
m_paddingValue = op.padding_value();
|
||||
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
|
||||
// Cache a few variables.
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_inputDepth = input_dims[0];
|
||||
m_inputPlanes = input_dims[1];
|
||||
m_inputRows = input_dims[2];
|
||||
m_inputCols = input_dims[3];
|
||||
} else {
|
||||
m_inputDepth = input_dims[NumInputDims-1];
|
||||
m_inputPlanes = input_dims[NumInputDims-2];
|
||||
m_inputRows = input_dims[NumInputDims-3];
|
||||
m_inputCols = input_dims[NumInputDims-4];
|
||||
}
|
||||
|
||||
m_plane_strides = op.plane_strides();
|
||||
m_row_strides = op.row_strides();
|
||||
m_col_strides = op.col_strides();
|
||||
|
||||
// Input strides and effective input/patch size
|
||||
m_in_plane_strides = op.in_plane_strides();
|
||||
m_in_row_strides = op.in_row_strides();
|
||||
m_in_col_strides = op.in_col_strides();
|
||||
m_plane_inflate_strides = op.plane_inflate_strides();
|
||||
m_row_inflate_strides = op.row_inflate_strides();
|
||||
m_col_inflate_strides = op.col_inflate_strides();
|
||||
|
||||
// The "effective" spatial size after inflating data with zeros.
|
||||
m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1;
|
||||
m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1;
|
||||
m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1;
|
||||
m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1);
|
||||
m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1);
|
||||
m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1);
|
||||
|
||||
if (op.padding_explicit()) {
|
||||
m_outputPlanes = numext::ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
|
||||
m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
|
||||
m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
|
||||
m_planePaddingTop = op.padding_top_z();
|
||||
m_rowPaddingTop = op.padding_top();
|
||||
m_colPaddingLeft = op.padding_left();
|
||||
} else {
|
||||
// Computing padding from the type
|
||||
switch (op.padding_type()) {
|
||||
case PADDING_VALID:
|
||||
m_outputPlanes = numext::ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
|
||||
m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
|
||||
m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
|
||||
m_planePaddingTop = 0;
|
||||
m_rowPaddingTop = 0;
|
||||
m_colPaddingLeft = 0;
|
||||
break;
|
||||
case PADDING_SAME: {
|
||||
m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast<float>(m_plane_strides));
|
||||
m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
|
||||
m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
|
||||
const Index dz = m_outputPlanes * m_plane_strides + m_patch_planes_eff - 1 - m_input_planes_eff;
|
||||
const Index dy = m_outputRows * m_row_strides + m_patch_rows_eff - 1 - m_input_rows_eff;
|
||||
const Index dx = m_outputCols * m_col_strides + m_patch_cols_eff - 1 - m_input_cols_eff;
|
||||
m_planePaddingTop = dz - dz / 2;
|
||||
m_rowPaddingTop = dy - dy / 2;
|
||||
m_colPaddingLeft = dx - dx / 2;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
eigen_assert(false && "unexpected padding");
|
||||
}
|
||||
}
|
||||
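// [Editor's illustration, not part of the original file] Worked example with assumed
// sizes: for an effective input of 5 planes, an effective patch of 3 planes and a
// plane stride of 2, PADDING_VALID gives ceil((5 - 3 + 1) / 2) = 2 output planes and
// no padding, whereas PADDING_SAME gives ceil(5 / 2) = 3 output planes and needs
// dz = 3*2 + 3 - 1 - 5 = 3 zeros of padding, of which planePaddingTop = 3 - 3/2 = 2
// are placed at the top and the remaining 1 implicitly at the far end.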
eigen_assert(m_outputRows > 0);
|
||||
eigen_assert(m_outputCols > 0);
|
||||
eigen_assert(m_outputPlanes > 0);
|
||||
|
||||
// Dimensions for result of extraction.
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
// ColMajor
|
||||
// 0: depth
|
||||
// 1: patch_planes
|
||||
// 2: patch_rows
|
||||
// 3: patch_cols
|
||||
// 4: number of patches
|
||||
// 5 and beyond: anything else (such as batch).
|
||||
m_dimensions[0] = input_dims[0];
|
||||
m_dimensions[1] = op.patch_planes();
|
||||
m_dimensions[2] = op.patch_rows();
|
||||
m_dimensions[3] = op.patch_cols();
|
||||
m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols;
|
||||
for (int i = 5; i < NumDims; ++i) {
|
||||
m_dimensions[i] = input_dims[i-1];
|
||||
}
|
||||
} else {
|
||||
// RowMajor
|
||||
// NumDims-1: depth
|
||||
// NumDims-2: patch_planes
|
||||
// NumDims-3: patch_rows
|
||||
// NumDims-4: patch_cols
|
||||
// NumDims-5: number of patches
|
||||
// NumDims-6 and beyond: anything else (such as batch).
|
||||
m_dimensions[NumDims-1] = input_dims[NumInputDims-1];
|
||||
m_dimensions[NumDims-2] = op.patch_planes();
|
||||
m_dimensions[NumDims-3] = op.patch_rows();
|
||||
m_dimensions[NumDims-4] = op.patch_cols();
|
||||
m_dimensions[NumDims-5] = m_outputPlanes * m_outputRows * m_outputCols;
|
||||
for (int i = NumDims-6; i >= 0; --i) {
|
||||
m_dimensions[i] = input_dims[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Strides for the output tensor.
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_rowStride = m_dimensions[1];
|
||||
m_colStride = m_dimensions[2] * m_rowStride;
|
||||
m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0];
|
||||
m_otherStride = m_patchStride * m_dimensions[4];
|
||||
} else {
|
||||
m_rowStride = m_dimensions[NumDims-2];
|
||||
m_colStride = m_dimensions[NumDims-3] * m_rowStride;
|
||||
m_patchStride = m_colStride * m_dimensions[NumDims-4] * m_dimensions[NumDims-1];
|
||||
m_otherStride = m_patchStride * m_dimensions[NumDims-5];
|
||||
}
|
||||
|
||||
// Strides for navigating through the input tensor.
|
||||
m_planeInputStride = m_inputDepth;
|
||||
m_rowInputStride = m_inputDepth * m_inputPlanes;
|
||||
m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes;
|
||||
m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes;
|
||||
|
||||
m_outputPlanesRows = m_outputPlanes * m_outputRows;
|
||||
|
||||
// Fast representations of different variables.
|
||||
m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
|
||||
m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
|
||||
m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
|
||||
m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride);
|
||||
m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides);
|
||||
m_fastInputColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides);
|
||||
m_fastInputPlaneStride = internal::TensorIntDivisor<Index>(m_plane_inflate_strides);
|
||||
m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff);
|
||||
m_fastOutputPlanes = internal::TensorIntDivisor<Index>(m_outputPlanes);
|
||||
m_fastOutputPlanesRows = internal::TensorIntDivisor<Index>(m_outputPlanesRows);
|
||||
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]);
|
||||
} else {
|
||||
m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
return true;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
// Patch index corresponding to the passed in index.
|
||||
const Index patchIndex = index / m_fastPatchStride;
|
||||
|
||||
// Spatial offset within the patch. This has to be translated into 3D
|
||||
// coordinates within the patch.
|
||||
const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth;
|
||||
|
||||
// Batch, etc.
|
||||
const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride;
|
||||
const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
|
||||
|
||||
// Calculate column index in the original input tensor.
|
||||
const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
|
||||
const Index colOffset = patchOffset / m_fastColStride;
|
||||
const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
|
||||
const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
|
||||
if (inputCol < 0 || inputCol >= m_input_cols_eff ||
|
||||
((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
|
||||
return Scalar(m_paddingValue);
|
||||
}
|
||||
|
||||
// Calculate row index in the original input tensor.
|
||||
const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
|
||||
const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride;
|
||||
const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
|
||||
const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
|
||||
if (inputRow < 0 || inputRow >= m_input_rows_eff ||
|
||||
((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
|
||||
return Scalar(m_paddingValue);
|
||||
}
|
||||
|
||||
// Calculate plane index in the original input tensor.
|
||||
const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
|
||||
const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride;
|
||||
const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop;
|
||||
const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0);
|
||||
if (inputPlane < 0 || inputPlane >= m_input_planes_eff ||
|
||||
((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) {
|
||||
return Scalar(m_paddingValue);
|
||||
}
|
||||
|
||||
const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
|
||||
const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
|
||||
|
||||
const Index inputIndex = depth +
|
||||
origInputRow * m_rowInputStride +
|
||||
origInputCol * m_colInputStride +
|
||||
origInputPlane * m_planeInputStride +
|
||||
otherIndex * m_otherInputStride;
|
||||
|
||||
return m_impl.coeff(inputIndex);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
|
||||
m_in_plane_strides != 1 || m_plane_inflate_strides != 1) {
|
||||
return packetWithPossibleZero(index);
|
||||
}
|
||||
|
||||
const Index indices[2] = {index, index + PacketSize - 1};
|
||||
const Index patchIndex = indices[0] / m_fastPatchStride;
|
||||
if (patchIndex != indices[1] / m_fastPatchStride) {
|
||||
return packetWithPossibleZero(index);
|
||||
}
|
||||
const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride;
|
||||
eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
|
||||
|
||||
// Find the offset of the element wrt the location of the first element.
|
||||
const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth,
|
||||
(indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth};
|
||||
|
||||
const Index patch3DIndex = (NumDims == 5) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
|
||||
eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
|
||||
|
||||
const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
|
||||
const Index colOffsets[2] = {
|
||||
patchOffsets[0] / m_fastColStride,
|
||||
patchOffsets[1] / m_fastColStride};
|
||||
|
||||
// Calculate col indices in the original input tensor.
|
||||
const Index inputCols[2] = {
|
||||
colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft,
|
||||
colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
|
||||
if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
|
||||
return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
|
||||
}
|
||||
|
||||
if (inputCols[0] != inputCols[1]) {
|
||||
return packetWithPossibleZero(index);
|
||||
}
|
||||
|
||||
const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
|
||||
const Index rowOffsets[2] = {
|
||||
(patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride,
|
||||
(patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride};
|
||||
eigen_assert(rowOffsets[0] <= rowOffsets[1]);
|
||||
// Calculate row indices in the original input tensor.
|
||||
const Index inputRows[2] = {
|
||||
rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop,
|
||||
rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
|
||||
|
||||
if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
|
||||
return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
|
||||
}
|
||||
|
||||
if (inputRows[0] != inputRows[1]) {
|
||||
return packetWithPossibleZero(index);
|
||||
}
|
||||
|
||||
const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
|
||||
const Index planeOffsets[2] = {
|
||||
patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride,
|
||||
patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride};
|
||||
eigen_assert(planeOffsets[0] <= planeOffsets[1]);
|
||||
const Index inputPlanes[2] = {
|
||||
planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop,
|
||||
planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop};
|
||||
|
||||
if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) {
|
||||
return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
|
||||
}
|
||||
|
||||
if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
|
||||
// no padding
|
||||
const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
|
||||
const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
|
||||
const Index inputIndex = depth +
|
||||
inputRows[0] * m_rowInputStride +
|
||||
inputCols[0] * m_colInputStride +
|
||||
m_planeInputStride * inputPlanes[0] +
|
||||
otherIndex * m_otherInputStride;
|
||||
return m_impl.template packet<Unaligned>(inputIndex);
|
||||
}
|
||||
|
||||
return packetWithPossibleZero(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
const double compute_cost =
|
||||
10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() +
|
||||
8 * TensorOpCost::AddCost<Index>();
|
||||
return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||
|
||||
Index planePaddingTop() const { return m_planePaddingTop; }
|
||||
Index rowPaddingTop() const { return m_rowPaddingTop; }
|
||||
Index colPaddingLeft() const { return m_colPaddingLeft; }
|
||||
Index outputPlanes() const { return m_outputPlanes; }
|
||||
Index outputRows() const { return m_outputRows; }
|
||||
Index outputCols() const { return m_outputCols; }
|
||||
Index userPlaneStride() const { return m_plane_strides; }
|
||||
Index userRowStride() const { return m_row_strides; }
|
||||
Index userColStride() const { return m_col_strides; }
|
||||
Index userInPlaneStride() const { return m_in_plane_strides; }
|
||||
Index userInRowStride() const { return m_in_row_strides; }
|
||||
Index userInColStride() const { return m_in_col_strides; }
|
||||
Index planeInflateStride() const { return m_plane_inflate_strides; }
|
||||
Index rowInflateStride() const { return m_row_inflate_strides; }
|
||||
Index colInflateStride() const { return m_col_inflate_strides; }
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
|
||||
{
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
|
||||
Dimensions m_dimensions;
|
||||
|
||||
// Parameters passed to the constructor.
|
||||
Index m_plane_strides;
|
||||
Index m_row_strides;
|
||||
Index m_col_strides;
|
||||
|
||||
Index m_outputPlanes;
|
||||
Index m_outputRows;
|
||||
Index m_outputCols;
|
||||
|
||||
Index m_planePaddingTop;
|
||||
Index m_rowPaddingTop;
|
||||
Index m_colPaddingLeft;
|
||||
|
||||
Index m_in_plane_strides;
|
||||
Index m_in_row_strides;
|
||||
Index m_in_col_strides;
|
||||
|
||||
Index m_plane_inflate_strides;
|
||||
Index m_row_inflate_strides;
|
||||
Index m_col_inflate_strides;
|
||||
|
||||
// Cached input size.
|
||||
Index m_inputDepth;
|
||||
Index m_inputPlanes;
|
||||
Index m_inputRows;
|
||||
Index m_inputCols;
|
||||
|
||||
// Other cached variables.
|
||||
Index m_outputPlanesRows;
|
||||
|
||||
// Effective input/patch post-inflation size.
|
||||
Index m_input_planes_eff;
|
||||
Index m_input_rows_eff;
|
||||
Index m_input_cols_eff;
|
||||
Index m_patch_planes_eff;
|
||||
Index m_patch_rows_eff;
|
||||
Index m_patch_cols_eff;
|
||||
|
||||
// Strides for the output tensor.
|
||||
Index m_otherStride;
|
||||
Index m_patchStride;
|
||||
Index m_rowStride;
|
||||
Index m_colStride;
|
||||
|
||||
// Strides for the input tensor.
|
||||
Index m_planeInputStride;
|
||||
Index m_rowInputStride;
|
||||
Index m_colInputStride;
|
||||
Index m_otherInputStride;
|
||||
|
||||
internal::TensorIntDivisor<Index> m_fastOtherStride;
|
||||
internal::TensorIntDivisor<Index> m_fastPatchStride;
|
||||
internal::TensorIntDivisor<Index> m_fastColStride;
|
||||
internal::TensorIntDivisor<Index> m_fastRowStride;
|
||||
internal::TensorIntDivisor<Index> m_fastInputPlaneStride;
|
||||
internal::TensorIntDivisor<Index> m_fastInputRowStride;
|
||||
internal::TensorIntDivisor<Index> m_fastInputColStride;
|
||||
internal::TensorIntDivisor<Index> m_fastInputColsEff;
|
||||
internal::TensorIntDivisor<Index> m_fastOutputPlanesRows;
|
||||
internal::TensorIntDivisor<Index> m_fastOutputPlanes;
|
||||
internal::TensorIntDivisor<Index> m_fastOutputDepth;
|
||||
|
||||
Scalar m_paddingValue;
|
||||
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
};
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
|
|
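A minimal usage sketch of the volume-patch op evaluated above (illustration only, not part of the vendored sources; it assumes the extract_volume_patches() entry point that the unsupported Tensor module exposes for this op):

#include <cassert>
#include <unsupported/Eigen/CXX11/Tensor>

void volume_patch_example() {
  // ColMajor rank-4 input: (depth, planes, rows, cols).
  Eigen::Tensor<float, 4> input(3, 8, 8, 8);
  input.setRandom();
  // Extract 2x2x2 volume patches; the evaluator above produces a rank-5 result:
  // (depth, patch_planes, patch_rows, patch_cols, number of patches).
  Eigen::Tensor<float, 5> patches = input.extract_volume_patches(2, 2, 2);
  // With the default PADDING_SAME and unit strides there is one patch per input voxel.
  assert(patches.dimension(4) == 8 * 8 * 8);
}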
@ -0,0 +1,293 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
|
||||
#define EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
class DynamicSGroup
|
||||
{
|
||||
public:
|
||||
inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) { m_elements.push_back(ge(Generator(0, 0, 0))); }
|
||||
inline DynamicSGroup(const DynamicSGroup& o) : m_numIndices(o.m_numIndices), m_elements(o.m_elements), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { }
|
||||
inline DynamicSGroup(DynamicSGroup&& o) : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { std::swap(m_elements, o.m_elements); }
|
||||
inline DynamicSGroup& operator=(const DynamicSGroup& o) { m_numIndices = o.m_numIndices; m_elements = o.m_elements; m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; }
|
||||
inline DynamicSGroup& operator=(DynamicSGroup&& o) { m_numIndices = o.m_numIndices; std::swap(m_elements, o.m_elements); m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; }
|
||||
|
||||
void add(int one, int two, int flags = 0);
|
||||
|
||||
template<typename Gen_>
|
||||
inline void add(Gen_) { add(Gen_::One, Gen_::Two, Gen_::Flags); }
|
||||
inline void addSymmetry(int one, int two) { add(one, two, 0); }
|
||||
inline void addAntiSymmetry(int one, int two) { add(one, two, NegationFlag); }
|
||||
inline void addHermiticity(int one, int two) { add(one, two, ConjugationFlag); }
|
||||
inline void addAntiHermiticity(int one, int two) { add(one, two, NegationFlag | ConjugationFlag); }
|
||||
|
||||
template<typename Op, typename RV, typename Index, std::size_t N, typename... Args>
|
||||
inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) const
|
||||
{
|
||||
eigen_assert(N >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
|
||||
for (std::size_t i = 0; i < size(); i++)
|
||||
initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list<int, N>::type()), m_elements[i].flags, initial, std::forward<Args>(args)...);
|
||||
return initial;
|
||||
}
|
||||
|
||||
template<typename Op, typename RV, typename Index, typename... Args>
|
||||
inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args) const
|
||||
{
|
||||
eigen_assert(idx.size() >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
|
||||
for (std::size_t i = 0; i < size(); i++)
|
||||
initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward<Args>(args)...);
|
||||
return initial;
|
||||
}
|
||||
|
||||
inline int globalFlags() const { return m_globalFlags; }
|
||||
inline std::size_t size() const { return m_elements.size(); }
|
||||
|
||||
template<typename Tensor_, typename... IndexTypes>
|
||||
inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const
|
||||
{
|
||||
static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
|
||||
return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
|
||||
}
|
||||
|
||||
template<typename Tensor_>
|
||||
inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const
|
||||
{
|
||||
return internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup>(tensor, *this, indices);
|
||||
}
|
||||
private:
|
||||
struct GroupElement {
|
||||
std::vector<int> representation;
|
||||
int flags;
|
||||
bool isId() const
|
||||
{
|
||||
for (std::size_t i = 0; i < representation.size(); i++)
|
||||
if (i != (size_t)representation[i])
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
};
|
||||
struct Generator {
|
||||
int one;
|
||||
int two;
|
||||
int flags;
|
||||
constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {}
|
||||
};
|
||||
|
||||
std::size_t m_numIndices;
|
||||
std::vector<GroupElement> m_elements;
|
||||
std::vector<Generator> m_generators;
|
||||
int m_globalFlags;
|
||||
|
||||
template<typename Index, std::size_t N, int... n>
|
||||
inline std::array<Index, N> h_permute(std::size_t which, const std::array<Index, N>& idx, internal::numeric_list<int, n...>) const
|
||||
{
|
||||
return std::array<Index, N>{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }};
|
||||
}
|
||||
|
||||
template<typename Index>
|
||||
inline std::vector<Index> h_permute(std::size_t which, std::vector<Index> idx) const
|
||||
{
|
||||
std::vector<Index> result;
|
||||
result.reserve(idx.size());
|
||||
for (auto k : m_elements[which].representation)
|
||||
result.push_back(idx[k]);
|
||||
for (std::size_t i = m_numIndices; i < idx.size(); i++)
|
||||
result.push_back(idx[i]);
|
||||
return result;
|
||||
}
|
||||
|
||||
inline GroupElement ge(Generator const& g) const
|
||||
{
|
||||
GroupElement result;
|
||||
result.representation.reserve(m_numIndices);
|
||||
result.flags = g.flags;
|
||||
for (std::size_t k = 0; k < m_numIndices; k++) {
|
||||
if (k == (std::size_t)g.one)
|
||||
result.representation.push_back(g.two);
|
||||
else if (k == (std::size_t)g.two)
|
||||
result.representation.push_back(g.one);
|
||||
else
|
||||
result.representation.push_back(int(k));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
GroupElement mul(GroupElement, GroupElement) const;
|
||||
inline GroupElement mul(Generator g1, GroupElement g2) const
|
||||
{
|
||||
return mul(ge(g1), g2);
|
||||
}
|
||||
|
||||
inline GroupElement mul(GroupElement g1, Generator g2) const
|
||||
{
|
||||
return mul(g1, ge(g2));
|
||||
}
|
||||
|
||||
inline GroupElement mul(Generator g1, Generator g2) const
|
||||
{
|
||||
return mul(ge(g1), ge(g2));
|
||||
}
|
||||
|
||||
inline int findElement(GroupElement e) const
|
||||
{
|
||||
for (auto ee : m_elements) {
|
||||
if (ee.representation == e.representation)
|
||||
return ee.flags ^ e.flags;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
void updateGlobalFlags(int flagDiffOfSameGenerator);
|
||||
};
|
||||
|
||||
// dynamic symmetry group that auto-adds the template parameters in the constructor
|
||||
template<typename... Gen>
|
||||
class DynamicSGroupFromTemplateArgs : public DynamicSGroup
|
||||
{
|
||||
public:
|
||||
inline DynamicSGroupFromTemplateArgs() : DynamicSGroup()
|
||||
{
|
||||
add_all(internal::type_list<Gen...>());
|
||||
}
|
||||
inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) { }
|
||||
inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) { }
|
||||
inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(const DynamicSGroupFromTemplateArgs<Gen...>& o) { DynamicSGroup::operator=(o); return *this; }
|
||||
inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(DynamicSGroupFromTemplateArgs<Gen...>&& o) { DynamicSGroup::operator=(o); return *this; }
|
||||
|
||||
private:
|
||||
template<typename Gen1, typename... GenNext>
|
||||
inline void add_all(internal::type_list<Gen1, GenNext...>)
|
||||
{
|
||||
add(Gen1());
|
||||
add_all(internal::type_list<GenNext...>());
|
||||
}
|
||||
|
||||
inline void add_all(internal::type_list<>)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElement g2) const
|
||||
{
|
||||
eigen_internal_assert(g1.representation.size() == m_numIndices);
|
||||
eigen_internal_assert(g2.representation.size() == m_numIndices);
|
||||
|
||||
GroupElement result;
|
||||
result.representation.reserve(m_numIndices);
|
||||
for (std::size_t i = 0; i < m_numIndices; i++) {
|
||||
int v = g2.representation[g1.representation[i]];
|
||||
eigen_assert(v >= 0);
|
||||
result.representation.push_back(v);
|
||||
}
|
||||
result.flags = g1.flags ^ g2.flags;
|
||||
return result;
|
||||
}
|
||||
|
||||
inline void DynamicSGroup::add(int one, int two, int flags)
|
||||
{
|
||||
eigen_assert(one >= 0);
|
||||
eigen_assert(two >= 0);
|
||||
eigen_assert(one != two);
|
||||
|
||||
if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) {
|
||||
std::size_t newNumIndices = (one > two) ? one : two + 1;
|
||||
for (auto& gelem : m_elements) {
|
||||
gelem.representation.reserve(newNumIndices);
|
||||
for (std::size_t i = m_numIndices; i < newNumIndices; i++)
|
||||
gelem.representation.push_back(i);
|
||||
}
|
||||
m_numIndices = newNumIndices;
|
||||
}
|
||||
|
||||
Generator g{one, two, flags};
|
||||
GroupElement e = ge(g);
|
||||
|
||||
/* special case for first generator */
|
||||
if (m_elements.size() == 1) {
|
||||
while (!e.isId()) {
|
||||
m_elements.push_back(e);
|
||||
e = mul(e, g);
|
||||
}
|
||||
|
||||
if (e.flags > 0)
|
||||
updateGlobalFlags(e.flags);
|
||||
|
||||
// only add in case we didn't have identity
|
||||
if (m_elements.size() > 1)
|
||||
m_generators.push_back(g);
|
||||
return;
|
||||
}
|
||||
|
||||
int p = findElement(e);
|
||||
if (p >= 0) {
|
||||
updateGlobalFlags(p);
|
||||
return;
|
||||
}
|
||||
|
||||
std::size_t coset_order = m_elements.size();
|
||||
m_elements.push_back(e);
|
||||
for (std::size_t i = 1; i < coset_order; i++)
|
||||
m_elements.push_back(mul(m_elements[i], e));
|
||||
m_generators.push_back(g);
|
||||
|
||||
std::size_t coset_rep = coset_order;
|
||||
do {
|
||||
for (auto g : m_generators) {
|
||||
e = mul(m_elements[coset_rep], g);
|
||||
p = findElement(e);
|
||||
if (p < 0) {
|
||||
// element not yet in group
|
||||
m_elements.push_back(e);
|
||||
for (std::size_t i = 1; i < coset_order; i++)
|
||||
m_elements.push_back(mul(m_elements[i], e));
|
||||
} else if (p > 0) {
|
||||
updateGlobalFlags(p);
|
||||
}
|
||||
}
|
||||
coset_rep += coset_order;
|
||||
} while (coset_rep < m_elements.size());
|
||||
}
|
||||
|
||||
inline void DynamicSGroup::updateGlobalFlags(int flagDiffOfSameGenerator)
|
||||
{
|
||||
switch (flagDiffOfSameGenerator) {
|
||||
case 0:
|
||||
default:
|
||||
// nothing happened
|
||||
break;
|
||||
case NegationFlag:
|
||||
// every element is its own negative => whole tensor is zero
|
||||
m_globalFlags |= GlobalZeroFlag;
|
||||
break;
|
||||
case ConjugationFlag:
|
||||
// every element is its own conjugate => whole tensor is real
|
||||
m_globalFlags |= GlobalRealFlag;
|
||||
break;
|
||||
case (NegationFlag | ConjugationFlag):
|
||||
// every element is its own negative conjugate => whole tensor is imaginary
|
||||
m_globalFlags |= GlobalImagFlag;
|
||||
break;
|
||||
/* NOTE:
|
||||
* since GlobalZeroFlag == GlobalRealFlag | GlobalImagFlag, if one generator
|
||||
* causes the tensor to be real and the next one to be imaginary, this will
|
||||
* trivially give the correct result
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
|
||||
|
||||
/*
|
||||
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
|
||||
*/
|
|
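A minimal usage sketch of DynamicSGroup as defined above (illustration only, not part of the vendored sources; the header names are the usual ones for this unsupported module):

#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/TensorSymmetry>

void dynamic_sgroup_example() {
  // Build the group at run time: antisymmetric in indices (0,1), symmetric in (2,3).
  Eigen::DynamicSGroup sym;
  sym.addAntiSymmetry(0, 1);
  sym.addSymmetry(2, 3);

  Eigen::Tensor<double, 4> t(3, 3, 3, 3);
  t.setZero();
  // The value setter writes all equivalent coefficients in one go:
  // t(0,1,2,0) = 5, t(1,0,2,0) = -5, t(0,1,0,2) = 5, t(1,0,0,2) = -5.
  sym(t, 0, 1, 2, 0) = 5.0;
}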
@ -0,0 +1,236 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
|
||||
#define EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename list> struct tensor_static_symgroup_permutate;
|
||||
|
||||
template<int... nn>
|
||||
struct tensor_static_symgroup_permutate<numeric_list<int, nn...>>
|
||||
{
|
||||
constexpr static std::size_t N = sizeof...(nn);
|
||||
|
||||
template<typename T>
|
||||
constexpr static inline std::array<T, N> run(const std::array<T, N>& indices)
|
||||
{
|
||||
return {{indices[nn]...}};
|
||||
}
|
||||
};
|
||||
|
||||
template<typename indices_, int flags_>
|
||||
struct tensor_static_symgroup_element
|
||||
{
|
||||
typedef indices_ indices;
|
||||
constexpr static int flags = flags_;
|
||||
};
|
||||
|
||||
template<typename Gen, int N>
|
||||
struct tensor_static_symgroup_element_ctor
|
||||
{
|
||||
typedef tensor_static_symgroup_element<
|
||||
typename gen_numeric_list_swapped_pair<int, N, Gen::One, Gen::Two>::type,
|
||||
Gen::Flags
|
||||
> type;
|
||||
};
|
||||
|
||||
template<int N>
|
||||
struct tensor_static_symgroup_identity_ctor
|
||||
{
|
||||
typedef tensor_static_symgroup_element<
|
||||
typename gen_numeric_list<int, N>::type,
|
||||
0
|
||||
> type;
|
||||
};
|
||||
|
||||
template<typename iib>
|
||||
struct tensor_static_symgroup_multiply_helper
|
||||
{
|
||||
template<int... iia>
|
||||
constexpr static inline numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) {
|
||||
return numeric_list<int, get<iia, iib>::value...>();
|
||||
}
|
||||
};
|
||||
|
||||
template<typename A, typename B>
|
||||
struct tensor_static_symgroup_multiply
|
||||
{
|
||||
private:
|
||||
typedef typename A::indices iia;
|
||||
typedef typename B::indices iib;
|
||||
constexpr static int ffa = A::flags;
|
||||
constexpr static int ffb = B::flags;
|
||||
|
||||
public:
|
||||
static_assert(iia::count == iib::count, "Cannot multiply symmetry elements with different number of indices.");
|
||||
|
||||
typedef tensor_static_symgroup_element<
|
||||
decltype(tensor_static_symgroup_multiply_helper<iib>::helper(iia())),
|
||||
ffa ^ ffb
|
||||
> type;
|
||||
};
|
||||
|
||||
template<typename A, typename B>
|
||||
struct tensor_static_symgroup_equality
|
||||
{
|
||||
typedef typename A::indices iia;
|
||||
typedef typename B::indices iib;
|
||||
constexpr static int ffa = A::flags;
|
||||
constexpr static int ffb = B::flags;
|
||||
static_assert(iia::count == iib::count, "Cannot compare symmetry elements with different number of indices.");
|
||||
|
||||
constexpr static bool value = is_same<iia, iib>::value;
|
||||
|
||||
private:
|
||||
/* this should be zero if they are identical, or else the tensor
|
||||
* will be forced to be pure real, pure imaginary or even pure zero
|
||||
*/
|
||||
constexpr static int flags_cmp_ = ffa ^ ffb;
|
||||
|
||||
/* either they are not equal, in which case we don't care whether the flags
|
||||
* match, or they are equal, and then we have to check
|
||||
*/
|
||||
constexpr static bool is_zero = value && flags_cmp_ == NegationFlag;
|
||||
constexpr static bool is_real = value && flags_cmp_ == ConjugationFlag;
|
||||
constexpr static bool is_imag = value && flags_cmp_ == (NegationFlag | ConjugationFlag);
|
||||
|
||||
public:
|
||||
constexpr static int global_flags =
|
||||
(is_real ? GlobalRealFlag : 0) |
|
||||
(is_imag ? GlobalImagFlag : 0) |
|
||||
(is_zero ? GlobalZeroFlag : 0);
|
||||
};
|
||||
|
||||
template<std::size_t NumIndices, typename... Gen>
|
||||
struct tensor_static_symgroup
|
||||
{
|
||||
typedef StaticSGroup<Gen...> type;
|
||||
constexpr static std::size_t size = type::static_size;
|
||||
};
|
||||
|
||||
template<typename Index, std::size_t N, int... ii, int... jj>
|
||||
constexpr static inline std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx, internal::numeric_list<int, ii...>, internal::numeric_list<int, jj...>)
|
||||
{
|
||||
return {{ idx[ii]..., idx[jj]... }};
|
||||
}
|
||||
|
||||
template<typename Index, int... ii>
|
||||
static inline std::vector<Index> tensor_static_symgroup_index_permute(std::vector<Index> idx, internal::numeric_list<int, ii...>)
|
||||
{
|
||||
std::vector<Index> result{{ idx[ii]... }};
|
||||
std::size_t target_size = idx.size();
|
||||
for (std::size_t i = result.size(); i < target_size; i++)
|
||||
result.push_back(idx[i]);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<typename T> struct tensor_static_symgroup_do_apply;
|
||||
|
||||
template<typename first, typename... next>
|
||||
struct tensor_static_symgroup_do_apply<internal::type_list<first, next...>>
|
||||
{
|
||||
template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args>
|
||||
static inline RV run(const std::array<Index, NumIndices>& idx, RV initial, Args&&... args)
|
||||
{
|
||||
static_assert(NumIndices >= SGNumIndices, "Can only apply symmetry group to objects that have at least the required amount of indices.");
|
||||
typedef typename internal::gen_numeric_list<int, NumIndices - SGNumIndices, SGNumIndices>::type remaining_indices;
|
||||
initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()), first::flags, initial, std::forward<Args>(args)...);
|
||||
return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...);
|
||||
}
|
||||
|
||||
template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args>
|
||||
static inline RV run(const std::vector<Index>& idx, RV initial, Args&&... args)
|
||||
{
|
||||
eigen_assert(idx.size() >= SGNumIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
|
||||
initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, std::forward<Args>(args)...);
|
||||
return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...);
|
||||
}
|
||||
};
|
||||
|
||||
template<EIGEN_TPL_PP_SPEC_HACK_DEF(typename, empty)>
|
||||
struct tensor_static_symgroup_do_apply<internal::type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>>
|
||||
{
|
||||
template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args>
|
||||
static inline RV run(const std::array<Index, NumIndices>&, RV initial, Args&&...)
|
||||
{
|
||||
// do nothing
|
||||
return initial;
|
||||
}
|
||||
|
||||
template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args>
|
||||
static inline RV run(const std::vector<Index>&, RV initial, Args&&...)
|
||||
{
|
||||
// do nothing
|
||||
return initial;
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
template<typename... Gen>
|
||||
class StaticSGroup
|
||||
{
|
||||
constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value;
|
||||
typedef internal::group_theory::enumerate_group_elements<
|
||||
internal::tensor_static_symgroup_multiply,
|
||||
internal::tensor_static_symgroup_equality,
|
||||
typename internal::tensor_static_symgroup_identity_ctor<NumIndices>::type,
|
||||
internal::type_list<typename internal::tensor_static_symgroup_element_ctor<Gen, NumIndices>::type...>
|
||||
> group_elements;
|
||||
typedef typename group_elements::type ge;
|
||||
public:
|
||||
constexpr inline StaticSGroup() {}
|
||||
constexpr inline StaticSGroup(const StaticSGroup<Gen...>&) {}
|
||||
constexpr inline StaticSGroup(StaticSGroup<Gen...>&&) {}
|
||||
|
||||
template<typename Op, typename RV, typename Index, std::size_t N, typename... Args>
|
||||
static inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args)
|
||||
{
|
||||
return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...);
|
||||
}
|
||||
|
||||
template<typename Op, typename RV, typename Index, typename... Args>
|
||||
static inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args)
|
||||
{
|
||||
eigen_assert(idx.size() == NumIndices);
|
||||
return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...);
|
||||
}
|
||||
|
||||
constexpr static std::size_t static_size = ge::count;
|
||||
|
||||
constexpr static inline std::size_t size() {
|
||||
return ge::count;
|
||||
}
|
||||
constexpr static inline int globalFlags() { return group_elements::global_flags; }
|
||||
|
||||
template<typename Tensor_, typename... IndexTypes>
|
||||
inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const
|
||||
{
|
||||
static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
|
||||
return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
|
||||
}
|
||||
|
||||
template<typename Tensor_>
|
||||
inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const
|
||||
{
|
||||
return internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>>(tensor, *this, indices);
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
|
||||
|
||||
/*
|
||||
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
|
||||
*/
|
|
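For comparison, a compile-time sketch with StaticSGroup (illustration only, not part of the vendored sources): the group is enumerated during compilation, so its size is available as a constant.

#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/TensorSymmetry>

void static_sgroup_example() {
  // A single pair swap generates a two-element group: the identity and the swap itself.
  typedef Eigen::StaticSGroup<Eigen::AntiSymmetry<0, 1> > Group;
  static_assert(Group::static_size == 2, "identity plus one swap");

  Eigen::Tensor<double, 3> t(4, 4, 4);
  t.setZero();
  Group sym;
  sym(t, 1, 2, 0) = 3.5;  // also stores t(2,1,0) = -3.5
}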
@ -0,0 +1,338 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
|
||||
#define EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
enum {
|
||||
NegationFlag = 0x01,
|
||||
ConjugationFlag = 0x02
|
||||
};
|
||||
|
||||
enum {
|
||||
GlobalRealFlag = 0x01,
|
||||
GlobalImagFlag = 0x02,
|
||||
GlobalZeroFlag = 0x03
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<std::size_t NumIndices, typename... Sym> struct tensor_symmetry_pre_analysis;
|
||||
template<std::size_t NumIndices, typename... Sym> struct tensor_static_symgroup;
|
||||
template<bool instantiate, std::size_t NumIndices, typename... Sym> struct tensor_static_symgroup_if;
|
||||
template<typename Tensor_> struct tensor_symmetry_calculate_flags;
|
||||
template<typename Tensor_> struct tensor_symmetry_assign_value;
|
||||
template<typename... Sym> struct tensor_symmetry_num_indices;
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
template<int One_, int Two_>
|
||||
struct Symmetry
|
||||
{
|
||||
static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
|
||||
constexpr static int One = One_;
|
||||
constexpr static int Two = Two_;
|
||||
constexpr static int Flags = 0;
|
||||
};
|
||||
|
||||
template<int One_, int Two_>
|
||||
struct AntiSymmetry
|
||||
{
|
||||
static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
|
||||
constexpr static int One = One_;
|
||||
constexpr static int Two = Two_;
|
||||
constexpr static int Flags = NegationFlag;
|
||||
};
|
||||
|
||||
template<int One_, int Two_>
|
||||
struct Hermiticity
|
||||
{
|
||||
static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
|
||||
constexpr static int One = One_;
|
||||
constexpr static int Two = Two_;
|
||||
constexpr static int Flags = ConjugationFlag;
|
||||
};
|
||||
|
||||
template<int One_, int Two_>
|
||||
struct AntiHermiticity
|
||||
{
|
||||
static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
|
||||
constexpr static int One = One_;
|
||||
constexpr static int Two = Two_;
|
||||
constexpr static int Flags = ConjugationFlag | NegationFlag;
|
||||
};
|
||||
|
||||
/** \class DynamicSGroup
|
||||
* \ingroup TensorSymmetry_Module
|
||||
*
|
||||
* \brief Dynamic symmetry group
|
||||
*
|
||||
* The %DynamicSGroup class represents a symmetry group that need not be known at
|
||||
* compile time. It is useful if one wants to support arbitrary run-time definable
|
||||
* symmetries for tensors, but it is also instantiated if a symmetry group is defined
|
||||
* at compile time that would be either too large for the compiler to reasonably
|
||||
* generate (using templates to calculate this at compile time is very inefficient)
|
||||
* or one that the compiler could generate but for which it would not make sense to
|
||||
* unroll the loop for setting coefficients anymore.
|
||||
*/
|
||||
class DynamicSGroup;
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class DynamicSGroupFromTemplateArgs
|
||||
* \ingroup TensorSymmetry_Module
|
||||
*
|
||||
* \brief Dynamic symmetry group, initialized from template arguments
|
||||
*
|
||||
* This class is a child class of DynamicSGroup. It uses the template arguments
|
||||
* specified to initialize itself.
|
||||
*/
|
||||
template<typename... Gen>
|
||||
class DynamicSGroupFromTemplateArgs;
|
||||
|
||||
/** \class StaticSGroup
|
||||
* \ingroup TensorSymmetry_Module
|
||||
*
|
||||
* \brief Static symmetry group
|
||||
*
|
||||
* This class represents a symmetry group that is known and resolved completely
|
||||
* at compile time. Ideally, no run-time penalty is incurred compared to the
|
||||
* manual unrolling of the symmetry.
|
||||
*
|
||||
* <b><i>CAUTION:</i></b>
|
||||
*
|
||||
* Do not use this class directly for large symmetry groups. The compiler
|
||||
* may run into a limit, segfault, or at the very least take a very,
|
||||
* very, very long time to compile the code. Use the SGroup class instead
|
||||
* if you want a static group. That class contains logic that will
|
||||
* automatically select the DynamicSGroup class instead if the symmetry
|
||||
* group becomes too large. (In that case, unrolling may not even be
|
||||
* beneficial.)
|
||||
*/
|
||||
template<typename... Gen>
|
||||
class StaticSGroup;
|
||||
|
||||
/** \class SGroup
|
||||
* \ingroup TensorSymmetry_Module
|
||||
*
|
||||
* \brief Symmetry group, initialized from template arguments
|
||||
*
|
||||
* This class represents a symmetry group whose generators are already
|
||||
* known at compile time. It may or may not be resolved at compile time,
|
||||
* depending on the estimated size of the group.
|
||||
*
|
||||
* \sa StaticSGroup
|
||||
* \sa DynamicSGroup
|
||||
*/
|
||||
template<typename... Gen>
|
||||
class SGroup : public internal::tensor_symmetry_pre_analysis<internal::tensor_symmetry_num_indices<Gen...>::value, Gen...>::root_type
|
||||
{
|
||||
public:
|
||||
constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value;
|
||||
typedef typename internal::tensor_symmetry_pre_analysis<NumIndices, Gen...>::root_type Base;
|
||||
|
||||
// make standard constructors + assignment operators public
|
||||
inline SGroup() : Base() { }
|
||||
inline SGroup(const SGroup<Gen...>& other) : Base(other) { }
|
||||
inline SGroup(SGroup<Gen...>&& other) : Base(other) { }
|
||||
inline SGroup<Gen...>& operator=(const SGroup<Gen...>& other) { Base::operator=(other); return *this; }
|
||||
inline SGroup<Gen...>& operator=(SGroup<Gen...>&& other) { Base::operator=(other); return *this; }
|
||||
|
||||
// all else is defined in the base class
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename... Sym> struct tensor_symmetry_num_indices
|
||||
{
|
||||
constexpr static std::size_t value = 1;
|
||||
};
|
||||
|
||||
template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...>
|
||||
{
|
||||
private:
|
||||
constexpr static std::size_t One = static_cast<std::size_t>(One_);
|
||||
constexpr static std::size_t Two = static_cast<std::size_t>(Two_);
|
||||
constexpr static std::size_t Three = tensor_symmetry_num_indices<Sym...>::value;
|
||||
|
||||
// don't use std::max, since it's not constexpr until C++14...
|
||||
constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1;
|
||||
public:
|
||||
constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? maxOneTwoPlusOne : Three;
|
||||
};
|
||||
|
||||
template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiSymmetry<One_, Two_>, Sym...>
|
||||
: public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
|
||||
template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Hermiticity<One_, Two_>, Sym...>
|
||||
: public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
|
||||
template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiHermiticity<One_, Two_>, Sym...>
|
||||
: public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class tensor_symmetry_pre_analysis
|
||||
* \ingroup TensorSymmetry_Module
|
||||
*
|
||||
* \brief Pre-select whether to use a static or dynamic symmetry group
|
||||
*
|
||||
* When a symmetry group could in principle be determined at compile time,
|
||||
* this template implements the logic whether to actually do that or whether
|
||||
* to rather defer that to runtime.
|
||||
*
|
||||
* The logic is as follows:
|
||||
* <dl>
|
||||
* <dt><b>No generators (trivial symmetry):</b></dt>
|
||||
* <dd>Use a trivial static group. Ideally, this has no performance impact
|
||||
* compared to not using symmetry at all. In practice, this might not
|
||||
* be the case.</dd>
|
||||
* <dt><b>More than 4 generators:</b></dt>
|
||||
* <dd>Calculate the group at run time, it is likely far too large for the
|
||||
* compiler to be able to properly generate it in a realistic time.</dd>
|
||||
* <dt><b>Up to and including 4 generators:</b></dt>
|
||||
* <dd>Actually enumerate all group elements, but then check how many there
|
||||
* are. If there are more than 16, it is unlikely that unrolling the
|
||||
* loop (as is done in the static compile-time case) is sensible, so
|
||||
* use a dynamic group instead. If there are at most 16 elements, actually
|
||||
* use that static group. Note that the largest group with 4 generators
|
||||
* still compiles with reasonable resources.</dd>
|
||||
* </dl>
|
||||
*
|
||||
* Note: Example compile-time performance with g++-4.6 on an Intel Core i5-3470
|
||||
* with 16 GiB RAM (all generators non-redundant and the subgroups don't
|
||||
* factorize):
|
||||
*
|
||||
* # Generators    -O0 -ggdb            -O2
* -------------------------------------------------------------------
*      1          0.5  s /   250 MiB   0.45 s /   230 MiB
*      2          0.5  s /   260 MiB   0.5  s /   250 MiB
*      3          0.65 s /   310 MiB   0.62 s /   310 MiB
*      4          2.2  s /   860 MiB   1.7  s /   770 MiB
*      5          130  s / 13000 MiB   120  s / 11000 MiB
|
||||
*
|
||||
* It is clear that everything is still very efficient up to 4 generators; beyond that,
|
||||
* the memory and CPU requirements become unreasonable. Thus we only instantiate
|
||||
* the template group theory logic if the number of generators supplied is 4 or
|
||||
* lower, otherwise this will be forced to be done during runtime, where the
|
||||
* algorithm is reasonably fast.
|
||||
*/
|
||||
template<std::size_t NumIndices>
|
||||
struct tensor_symmetry_pre_analysis<NumIndices>
|
||||
{
|
||||
typedef StaticSGroup<> root_type;
|
||||
};
|
||||
|
||||
template<std::size_t NumIndices, typename Gen_, typename... Gens_>
|
||||
struct tensor_symmetry_pre_analysis<NumIndices, Gen_, Gens_...>
|
||||
{
|
||||
constexpr static std::size_t max_static_generators = 4;
|
||||
constexpr static std::size_t max_static_elements = 16;
|
||||
typedef tensor_static_symgroup_if<(sizeof...(Gens_) + 1 <= max_static_generators), NumIndices, Gen_, Gens_...> helper;
|
||||
constexpr static std::size_t possible_size = helper::size;
|
||||
|
||||
typedef typename conditional<
|
||||
possible_size == 0 || possible_size >= max_static_elements,
|
||||
DynamicSGroupFromTemplateArgs<Gen_, Gens_...>,
|
||||
typename helper::type
|
||||
>::type root_type;
|
||||
};
|
||||
|
||||
template<bool instantiate, std::size_t NumIndices, typename... Gens>
|
||||
struct tensor_static_symgroup_if
|
||||
{
|
||||
constexpr static std::size_t size = 0;
|
||||
typedef void type;
|
||||
};
|
||||
|
||||
template<std::size_t NumIndices, typename... Gens>
|
||||
struct tensor_static_symgroup_if<true, NumIndices, Gens...> : tensor_static_symgroup<NumIndices, Gens...> {};
|
||||
|
||||
template<typename Tensor_>
|
||||
struct tensor_symmetry_assign_value
|
||||
{
|
||||
typedef typename Tensor_::Index Index;
|
||||
typedef typename Tensor_::Scalar Scalar;
|
||||
constexpr static std::size_t NumIndices = Tensor_::NumIndices;
|
||||
|
||||
static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transformation_flags, int dummy, Tensor_& tensor, const Scalar& value_)
|
||||
{
|
||||
Scalar value(value_);
|
||||
if (transformation_flags & ConjugationFlag)
|
||||
value = numext::conj(value);
|
||||
if (transformation_flags & NegationFlag)
|
||||
value = -value;
|
||||
tensor.coeffRef(transformed_indices) = value;
|
||||
return dummy;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Tensor_>
|
||||
struct tensor_symmetry_calculate_flags
|
||||
{
|
||||
typedef typename Tensor_::Index Index;
|
||||
constexpr static std::size_t NumIndices = Tensor_::NumIndices;
|
||||
|
||||
static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transform_flags, int current_flags, const std::array<Index, NumIndices>& orig_indices)
|
||||
{
|
||||
if (transformed_indices == orig_indices) {
|
||||
if (transform_flags & (ConjugationFlag | NegationFlag))
|
||||
return current_flags | GlobalImagFlag; // anti-hermitian diagonal
|
||||
else if (transform_flags & ConjugationFlag)
|
||||
return current_flags | GlobalRealFlag; // hermitian diagonal
|
||||
else if (transform_flags & NegationFlag)
|
||||
return current_flags | GlobalZeroFlag; // anti-symmetric diagonal
|
||||
}
|
||||
return current_flags;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Tensor_, typename Symmetry_, int Flags = 0>
|
||||
class tensor_symmetry_value_setter
|
||||
{
|
||||
public:
|
||||
typedef typename Tensor_::Index Index;
|
||||
typedef typename Tensor_::Scalar Scalar;
|
||||
constexpr static std::size_t NumIndices = Tensor_::NumIndices;
|
||||
|
||||
inline tensor_symmetry_value_setter(Tensor_& tensor, Symmetry_ const& symmetry, std::array<Index, NumIndices> const& indices)
|
||||
: m_tensor(tensor), m_symmetry(symmetry), m_indices(indices) { }
|
||||
|
||||
inline tensor_symmetry_value_setter<Tensor_, Symmetry_, Flags>& operator=(Scalar const& value)
|
||||
{
|
||||
doAssign(value);
|
||||
return *this;
|
||||
}
|
||||
private:
|
||||
Tensor_& m_tensor;
|
||||
Symmetry_ m_symmetry;
|
||||
std::array<Index, NumIndices> m_indices;
|
||||
|
||||
inline void doAssign(Scalar const& value)
|
||||
{
|
||||
#ifdef EIGEN_TENSOR_SYMMETRY_CHECK_VALUES
|
||||
int value_flags = m_symmetry.template apply<internal::tensor_symmetry_calculate_flags<Tensor_>, int>(m_indices, m_symmetry.globalFlags(), m_indices);
|
||||
if (value_flags & GlobalRealFlag)
|
||||
eigen_assert(numext::imag(value) == 0);
|
||||
if (value_flags & GlobalImagFlag)
|
||||
eigen_assert(numext::real(value) == 0);
|
||||
#endif
|
||||
m_symmetry.template apply<internal::tensor_symmetry_assign_value<Tensor_>, int>(m_indices, 0, m_tensor, value);
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
|
||||
|
||||
/*
|
||||
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
|
||||
*/
|
|
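A sketch of the SGroup front end declared above (illustration only, not part of the vendored sources): it runs the pre-analysis documented in this file and derives from either a static or a dynamic group, which can be checked with a type trait.

#include <type_traits>
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/TensorSymmetry>

// Up to 4 generators (and at most 16 elements) the pre-analysis keeps the group static...
typedef Eigen::SGroup<Eigen::Symmetry<0, 1>, Eigen::AntiSymmetry<2, 3> > SmallGroup;
static_assert(!std::is_base_of<Eigen::DynamicSGroup, SmallGroup>::value,
              "small groups stay static");

// ...while 5 generators exceed max_static_generators and fall back to the run-time group.
typedef Eigen::SGroup<Eigen::Symmetry<0, 1>, Eigen::Symmetry<1, 2>, Eigen::Symmetry<2, 3>,
                      Eigen::Symmetry<3, 4>, Eigen::Symmetry<4, 5> > LargeGroup;
static_assert(std::is_base_of<Eigen::DynamicSGroup, LargeGroup>::value,
              "large groups fall back to DynamicSGroup");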
@ -0,0 +1,669 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
|
||||
#define EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
namespace group_theory {
|
||||
|
||||
/** \internal
|
||||
* \file CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
|
||||
* This file contains C++ templates that implement group theory algorithms.
|
||||
*
|
||||
* The algorithms allow for a compile-time analysis of finite groups.
|
||||
*
|
||||
* Currently only Dimino's algorithm is implemented, which returns a list
|
||||
* of all elements in a group given a set of (possibly redundant) generators.
|
||||
* (One could also do that with the so-called orbital algorithm, but that
|
||||
* is much more expensive and usually has no advantages.)
|
||||
*/
|
||||
|
||||
/**********************************************************************
|
||||
* "Ok kid, here is where it gets complicated."
|
||||
* - Amelia Pond in the "Doctor Who" episode
|
||||
* "The Big Bang"
|
||||
*
|
||||
* Dimino's algorithm
|
||||
* ==================
|
||||
*
|
||||
* The following is Dimino's algorithm in sequential form:
|
||||
*
|
||||
* Input: identity element, list of generators, equality check,
|
||||
* multiplication operation
|
||||
* Output: list of group elements
|
||||
*
|
||||
* 1. add identity element
|
||||
* 2. remove identities from list of generators
|
||||
* 3. add all powers of first generator that aren't the
|
||||
* identity element
|
||||
* 4. go through all remaining generators:
|
||||
* a. if generator is already in the list of elements
|
||||
* -> do nothing
|
||||
* b. otherwise
|
||||
* i. remember current # of elements
|
||||
* (i.e. the size of the current subgroup)
|
||||
* ii. add all current elements (which includes
|
||||
* the identity) each multiplied from right
|
||||
* with the current generator to the group
|
||||
* iii. add all remaining cosets that are generated
|
||||
* by products of the new generator with itself
|
||||
* and all other generators seen so far
|
||||
*
|
||||
* In functional form, this is implemented as a long set of recursive
|
||||
* templates that have a complicated relationship.
|
||||
*
|
||||
* The main interface for Dimino's algorithm is the template
|
||||
* enumerate_group_elements. All lists are implemented as variadic
|
||||
* type_list<typename...> and numeric_list<typename = int, int...>
|
||||
* templates.
|
||||
*
|
||||
* 'Calling' templates is usually done via typedefs.
|
||||
*
|
||||
* This algorithm is an extended version of the basic version. The
|
||||
* extension consists in the fact that each group element has a set
|
||||
* of flags associated with it. Multiplication of two group elements
|
||||
* with each other results in a group element whose flags are the
|
||||
* XOR of the flags of the previous elements. Each time the algorithm
|
||||
* notices that a group element it just calculated is already in the
|
||||
* list of current elements, the flags of both will be compared and
|
||||
* added to the so-called 'global flags' of the group.
|
||||
*
|
||||
* The rationale behind this extension is that this allows not only
|
||||
* for the description of symmetries between tensor indices, but
|
||||
* also allows for the description of hermiticity, antisymmetry and
|
||||
* antihermiticity. Negation and conjugation each are specific bit
|
||||
* in the flags value and if two different ways to reach a group
|
||||
* element lead to two different flags, this poses a constraint on
|
||||
* the allowed values of the resulting tensor. For example, if a
|
||||
* group element is reached both with and without the conjugation
|
||||
* flags, it is clear that the resulting tensor has to be real.
|
||||
*
|
||||
* Note that this flag mechanism is quite generic and may have other
|
||||
* uses beyond tensor properties.
|
||||
*
|
||||
* IMPORTANT:
|
||||
* This algorithm assumes the group to be finite. If you try to
|
||||
* run it with a group that's infinite, the algorithm will only
|
||||
* terminate once you hit a compiler limit (max template depth).
|
||||
* Also note that trying to use this implementation to create a
|
||||
* very large group will probably either make you hit the same
|
||||
* limit, cause the compiler to segfault or at the very least
|
||||
* take a *really* long time (hours, days, weeks - sic!) to
|
||||
* compile. It is not recommended to plug in more than 4
|
||||
* generators, unless they are independent of each other.
|
||||
*/
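
/* For illustration only: a hedged runtime sketch of steps 1-3 of the sequential
* algorithm above (add the identity, then all powers of the first generator),
* assuming a hypothetical Element type providing operator* and operator==.
* The actual implementation below performs the equivalent computation purely
* at compile time on type lists.
*
* \code
* #include <vector>
*
* template<typename Element>
* std::vector<Element> dimino_first_step(const Element& id, const Element& g)
* {
*   std::vector<Element> elements;
*   elements.push_back(id);          // step 1: add the identity element
*   Element current = g;
*   while (!(current == id)) {       // step 3: add powers of the first generator
*     elements.push_back(current);
*     current = current * g;
*   }
*   return elements;
* }
* \endcode
*/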
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class strip_identities
|
||||
* \ingroup CXX11_TensorSymmetry_Module
|
||||
*
|
||||
* \brief Cleanse a list of group elements of the identity element
|
||||
*
|
||||
* This template is used to make a first pass through all initial
|
||||
* generators of Dimino's algorithm and remove the identity
|
||||
* elements.
|
||||
*
|
||||
* \sa enumerate_group_elements
|
||||
*/
|
||||
template<template<typename, typename> class Equality, typename id, typename L> struct strip_identities;
|
||||
|
||||
template<
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename t,
|
||||
typename... ts
|
||||
>
|
||||
struct strip_identities<Equality, id, type_list<t, ts...>>
|
||||
{
|
||||
typedef typename conditional<
|
||||
Equality<id, t>::value,
|
||||
typename strip_identities<Equality, id, type_list<ts...>>::type,
|
||||
typename concat<type_list<t>, typename strip_identities<Equality, id, type_list<ts...>>::type>::type
|
||||
>::type type;
|
||||
constexpr static int global_flags = Equality<id, t>::global_flags | strip_identities<Equality, id, type_list<ts...>>::global_flags;
|
||||
};
|
||||
|
||||
template<
|
||||
template<typename, typename> class Equality,
|
||||
typename id
|
||||
EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, ts)
|
||||
>
|
||||
struct strip_identities<Equality, id, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(ts)>>
|
||||
{
|
||||
typedef type_list<> type;
|
||||
constexpr static int global_flags = 0;
|
||||
};
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class dimino_first_step_elements_helper
|
||||
* \ingroup CXX11_TensorSymmetry_Module
|
||||
*
|
||||
* \brief Recursive template that adds powers of the first generator to the list of group elements
|
||||
*
|
||||
* This template calls itself recursively to add powers of the first
|
||||
* generator to the list of group elements. It stops if it reaches
|
||||
* the identity element again.
|
||||
*
|
||||
* \sa enumerate_group_elements, dimino_first_step_elements
|
||||
*/
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename g,
|
||||
typename current_element,
|
||||
typename elements,
|
||||
bool dont_add_current_element // = false
|
||||
>
|
||||
struct dimino_first_step_elements_helper
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
: // recursive inheritance is too difficult for Doxygen
|
||||
public dimino_first_step_elements_helper<
|
||||
Multiply,
|
||||
Equality,
|
||||
id,
|
||||
g,
|
||||
typename Multiply<current_element, g>::type,
|
||||
typename concat<elements, type_list<current_element>>::type,
|
||||
Equality<typename Multiply<current_element, g>::type, id>::value
|
||||
> {};
|
||||
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename g,
|
||||
typename current_element,
|
||||
typename elements
|
||||
>
|
||||
struct dimino_first_step_elements_helper<Multiply, Equality, id, g, current_element, elements, true>
|
||||
#endif // EIGEN_PARSED_BY_DOXYGEN
|
||||
{
|
||||
typedef elements type;
|
||||
constexpr static int global_flags = Equality<current_element, id>::global_flags;
|
||||
};
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class dimino_first_step_elements
|
||||
* \ingroup CXX11_TensorSymmetry_Module
|
||||
*
|
||||
* \brief Add all powers of the first generator to the list of group elements
|
||||
*
|
||||
* This template takes the first non-identity generator and generates the initial
|
||||
* list of elements which consists of all powers of that generator. For a group
|
||||
* with just one generator, the group would be fully enumerated after this step.
|
||||
*
|
||||
* \sa enumerate_group_elements
|
||||
*/
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename generators
|
||||
>
|
||||
struct dimino_first_step_elements
|
||||
{
|
||||
typedef typename get<0, generators>::type first_generator;
|
||||
typedef typename skip<1, generators>::type next_generators;
|
||||
typedef type_list<first_generator> generators_done;
|
||||
|
||||
typedef dimino_first_step_elements_helper<
|
||||
Multiply,
|
||||
Equality,
|
||||
id,
|
||||
first_generator,
|
||||
first_generator,
|
||||
type_list<id>,
|
||||
false
|
||||
> helper;
|
||||
typedef typename helper::type type;
|
||||
constexpr static int global_flags = helper::global_flags;
|
||||
};
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class dimino_get_coset_elements
|
||||
* \ingroup CXX11_TensorSymmetry_Module
|
||||
*
|
||||
* \brief Generate all elements of a specific coset
|
||||
*
|
||||
* This template generates all the elements of a specific coset by
|
||||
* multiplying all elements in the given subgroup with the new
|
||||
* coset representative. Note that the first element of the
|
||||
* subgroup is always the identity element, so the first element of
|
||||
* the result of this template is going to be the coset
|
||||
* representative itself.
|
||||
*
|
||||
* Note that this template accepts an additional boolean parameter
|
||||
* that specifies whether to actually generate the coset (true) or
|
||||
* just return an empty list (false).
|
||||
*
|
||||
* \sa enumerate_group_elements, dimino_add_cosets_for_rep
|
||||
*/
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
typename sub_group_elements,
|
||||
typename new_coset_rep,
|
||||
bool generate_coset // = true
|
||||
>
|
||||
struct dimino_get_coset_elements
|
||||
{
|
||||
typedef typename apply_op_from_right<Multiply, new_coset_rep, sub_group_elements>::type type;
|
||||
};
|
||||
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
typename sub_group_elements,
|
||||
typename new_coset_rep
|
||||
>
|
||||
struct dimino_get_coset_elements<Multiply, sub_group_elements, new_coset_rep, false>
|
||||
{
|
||||
typedef type_list<> type;
|
||||
};
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class dimino_add_cosets_for_rep
|
||||
* \ingroup CXX11_TensorSymmetry_Module
|
||||
*
|
||||
* \brief Recursive template for adding coset spaces
|
||||
*
|
||||
* This template multiplies the coset representative with a generator
|
||||
* from the list of previous generators. If the new element is not in
|
||||
* the group already, it adds the corresponding coset. Finally it
|
||||
* proceeds to call itself with the next generator from the list.
|
||||
*
|
||||
* \sa enumerate_group_elements, dimino_add_all_coset_spaces
|
||||
*/
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename sub_group_elements,
|
||||
typename elements,
|
||||
typename generators,
|
||||
typename rep_element,
|
||||
int sub_group_size
|
||||
>
|
||||
struct dimino_add_cosets_for_rep;
|
||||
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename sub_group_elements,
|
||||
typename elements,
|
||||
typename g,
|
||||
typename... gs,
|
||||
typename rep_element,
|
||||
int sub_group_size
|
||||
>
|
||||
struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<g, gs...>, rep_element, sub_group_size>
|
||||
{
|
||||
typedef typename Multiply<rep_element, g>::type new_coset_rep;
|
||||
typedef contained_in_list_gf<Equality, new_coset_rep, elements> _cil;
|
||||
constexpr static bool add_coset = !_cil::value;
|
||||
|
||||
typedef typename dimino_get_coset_elements<
|
||||
Multiply,
|
||||
sub_group_elements,
|
||||
new_coset_rep,
|
||||
add_coset
|
||||
>::type coset_elements;
|
||||
|
||||
typedef dimino_add_cosets_for_rep<
|
||||
Multiply,
|
||||
Equality,
|
||||
id,
|
||||
sub_group_elements,
|
||||
typename concat<elements, coset_elements>::type,
|
||||
type_list<gs...>,
|
||||
rep_element,
|
||||
sub_group_size
|
||||
> _helper;
|
||||
|
||||
typedef typename _helper::type type;
|
||||
constexpr static int global_flags = _cil::global_flags | _helper::global_flags;
|
||||
|
||||
/* Note that we don't have to update global flags here, since
|
||||
* we will only add these elements if they are not part of
|
||||
* the group already. But that only happens if the coset rep
|
||||
* is not already in the group, so the check for the coset rep
|
||||
* will catch this.
|
||||
*/
|
||||
};
|
||||
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename sub_group_elements,
|
||||
typename elements
|
||||
EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty),
|
||||
typename rep_element,
|
||||
int sub_group_size
|
||||
>
|
||||
struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, rep_element, sub_group_size>
|
||||
{
|
||||
typedef elements type;
|
||||
constexpr static int global_flags = 0;
|
||||
};
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class dimino_add_all_coset_spaces
|
||||
* \ingroup CXX11_TensorSymmetry_Module
|
||||
*
|
||||
* \brief Recursive template for adding all coset spaces for a new generator
|
||||
*
|
||||
* This template tries to go through the list of generators (with
|
||||
* the help of the dimino_add_cosets_for_rep template) as long as
|
||||
* it still finds elements that are not part of the group and add
|
||||
* the corresponding cosets.
|
||||
*
|
||||
* \sa enumerate_group_elements, dimino_add_cosets_for_rep
|
||||
*/
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename sub_group_elements,
|
||||
typename elements,
|
||||
typename generators,
|
||||
int sub_group_size,
|
||||
int rep_pos,
|
||||
bool stop_condition // = false
|
||||
>
|
||||
struct dimino_add_all_coset_spaces
|
||||
{
|
||||
typedef typename get<rep_pos, elements>::type rep_element;
|
||||
typedef dimino_add_cosets_for_rep<
|
||||
Multiply,
|
||||
Equality,
|
||||
id,
|
||||
sub_group_elements,
|
||||
elements,
|
||||
generators,
|
||||
rep_element,
|
||||
sub_group_elements::count
|
||||
> _ac4r;
|
||||
typedef typename _ac4r::type new_elements;
|
||||
|
||||
constexpr static int new_rep_pos = rep_pos + sub_group_elements::count;
|
||||
constexpr static bool new_stop_condition = new_rep_pos >= new_elements::count;
|
||||
|
||||
typedef dimino_add_all_coset_spaces<
|
||||
Multiply,
|
||||
Equality,
|
||||
id,
|
||||
sub_group_elements,
|
||||
new_elements,
|
||||
generators,
|
||||
sub_group_size,
|
||||
new_rep_pos,
|
||||
new_stop_condition
|
||||
> _helper;
|
||||
|
||||
typedef typename _helper::type type;
|
||||
constexpr static int global_flags = _helper::global_flags | _ac4r::global_flags;
|
||||
};
|
||||
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename sub_group_elements,
|
||||
typename elements,
|
||||
typename generators,
|
||||
int sub_group_size,
|
||||
int rep_pos
|
||||
>
|
||||
struct dimino_add_all_coset_spaces<Multiply, Equality, id, sub_group_elements, elements, generators, sub_group_size, rep_pos, true>
|
||||
{
|
||||
typedef elements type;
|
||||
constexpr static int global_flags = 0;
|
||||
};
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class dimino_add_generator
|
||||
* \ingroup CXX11_TensorSymmetry_Module
|
||||
*
|
||||
* \brief Enlarge the group by adding a new generator.
|
||||
*
|
||||
* It accepts a boolean parameter that determines if the generator is redundant,
|
||||
* i.e. was already seen in the group. In that case, it reduces to a no-op.
|
||||
*
|
||||
* \sa enumerate_group_elements, dimino_add_all_coset_spaces
|
||||
*/
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename elements,
|
||||
typename generators_done,
|
||||
typename current_generator,
|
||||
bool redundant // = false
|
||||
>
|
||||
struct dimino_add_generator
|
||||
{
|
||||
/* this template is only called if the generator is not redundant
|
||||
* => all elements of the group multiplied with the new generator
|
||||
* are going to be new elements of the most trivial coset space
|
||||
*/
|
||||
typedef typename apply_op_from_right<Multiply, current_generator, elements>::type multiplied_elements;
|
||||
typedef typename concat<elements, multiplied_elements>::type new_elements;
|
||||
|
||||
constexpr static int rep_pos = elements::count;
|
||||
|
||||
typedef dimino_add_all_coset_spaces<
|
||||
Multiply,
|
||||
Equality,
|
||||
id,
|
||||
elements, // elements of previous subgroup
|
||||
new_elements,
|
||||
typename concat<generators_done, type_list<current_generator>>::type,
|
||||
elements::count, // size of previous subgroup
|
||||
rep_pos,
|
||||
false // don't stop (because rep_pos >= new_elements::count is always false at this point)
|
||||
> _helper;
|
||||
typedef typename _helper::type type;
|
||||
constexpr static int global_flags = _helper::global_flags;
|
||||
};
|
||||
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename elements,
|
||||
typename generators_done,
|
||||
typename current_generator
|
||||
>
|
||||
struct dimino_add_generator<Multiply, Equality, id, elements, generators_done, current_generator, true>
|
||||
{
|
||||
// redundant case
|
||||
typedef elements type;
|
||||
constexpr static int global_flags = 0;
|
||||
};
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class dimino_add_remaining_generators
|
||||
* \ingroup CXX11_TensorSymmetry_Module
|
||||
*
|
||||
* \brief Recursive template that adds all remaining generators to a group
|
||||
*
|
||||
* Loop through the list of generators that remain and successively
|
||||
* add them to the group.
|
||||
*
|
||||
* \sa enumerate_group_elements, dimino_add_generator
|
||||
*/
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename generators_done,
|
||||
typename remaining_generators,
|
||||
typename elements
|
||||
>
|
||||
struct dimino_add_remaining_generators
|
||||
{
|
||||
typedef typename get<0, remaining_generators>::type first_generator;
|
||||
typedef typename skip<1, remaining_generators>::type next_generators;
|
||||
|
||||
typedef contained_in_list_gf<Equality, first_generator, elements> _cil;
|
||||
|
||||
typedef dimino_add_generator<
|
||||
Multiply,
|
||||
Equality,
|
||||
id,
|
||||
elements,
|
||||
generators_done,
|
||||
first_generator,
|
||||
_cil::value
|
||||
> _helper;
|
||||
|
||||
typedef typename _helper::type new_elements;
|
||||
|
||||
typedef dimino_add_remaining_generators<
|
||||
Multiply,
|
||||
Equality,
|
||||
id,
|
||||
typename concat<generators_done, type_list<first_generator>>::type,
|
||||
next_generators,
|
||||
new_elements
|
||||
> _next_iter;
|
||||
|
||||
typedef typename _next_iter::type type;
|
||||
constexpr static int global_flags =
|
||||
_cil::global_flags |
|
||||
_helper::global_flags |
|
||||
_next_iter::global_flags;
|
||||
};
|
||||
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename generators_done,
|
||||
typename elements
|
||||
>
|
||||
struct dimino_add_remaining_generators<Multiply, Equality, id, generators_done, type_list<>, elements>
|
||||
{
|
||||
typedef elements type;
|
||||
constexpr static int global_flags = 0;
|
||||
};
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class enumerate_group_elements_noid
|
||||
* \ingroup CXX11_TensorSymmetry_Module
|
||||
*
|
||||
* \brief Helper template that implements group element enumeration
|
||||
*
|
||||
* This is a helper template that implements the actual enumeration
|
||||
* of group elements. This has been split so that the list of
|
||||
* generators can be cleansed of the identity element before
|
||||
* performing the actual operation.
|
||||
*
|
||||
* \sa enumerate_group_elements
|
||||
*/
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename generators,
|
||||
int initial_global_flags = 0
|
||||
>
|
||||
struct enumerate_group_elements_noid
|
||||
{
|
||||
typedef dimino_first_step_elements<Multiply, Equality, id, generators> first_step;
|
||||
typedef typename first_step::type first_step_elements;
|
||||
|
||||
typedef dimino_add_remaining_generators<
|
||||
Multiply,
|
||||
Equality,
|
||||
id,
|
||||
typename first_step::generators_done,
|
||||
typename first_step::next_generators, // remaining_generators
|
||||
typename first_step::type // first_step elements
|
||||
> _helper;
|
||||
|
||||
typedef typename _helper::type type;
|
||||
constexpr static int global_flags =
|
||||
initial_global_flags |
|
||||
first_step::global_flags |
|
||||
_helper::global_flags;
|
||||
};
|
||||
|
||||
// in case no generators are specified
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
int initial_global_flags
|
||||
>
|
||||
struct enumerate_group_elements_noid<Multiply, Equality, id, type_list<>, initial_global_flags>
|
||||
{
|
||||
typedef type_list<id> type;
|
||||
constexpr static int global_flags = initial_global_flags;
|
||||
};
|
||||
|
||||
/** \internal
|
||||
*
|
||||
* \class enumerate_group_elements
|
||||
* \ingroup CXX11_TensorSymmetry_Module
|
||||
*
|
||||
* \brief Enumerate all elements in a finite group
|
||||
*
|
||||
* This template enumerates all elements in a finite group. It accepts
|
||||
* the following template parameters:
|
||||
*
|
||||
* \tparam Multiply The multiplication operation that multiplies two group elements
|
||||
* with each other.
|
||||
* \tparam Equality The equality check operation that checks if two group elements
|
||||
* are equal to another.
|
||||
* \tparam id The identity element
|
||||
* \tparam _generators A list of (possibly redundant) generators of the group
|
||||
*/
|
||||
template<
|
||||
template<typename, typename> class Multiply,
|
||||
template<typename, typename> class Equality,
|
||||
typename id,
|
||||
typename _generators
|
||||
>
|
||||
struct enumerate_group_elements
|
||||
: public enumerate_group_elements_noid<
|
||||
Multiply,
|
||||
Equality,
|
||||
id,
|
||||
typename strip_identities<Equality, id, _generators>::type,
|
||||
strip_identities<Equality, id, _generators>::global_flags
|
||||
>
|
||||
{
|
||||
};
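
/* Usage sketch (hedged): my_multiply, my_equality, my_identity, gen_a and gen_b
* below are hypothetical placeholders for user-provided templates and element
* types; they are not defined in this file. As used by the code above, Multiply
* must expose ::type, and Equality must expose ::value and ::global_flags.
*
* \code
* typedef enumerate_group_elements<
*   my_multiply,                  // template<typename A, typename B> multiplication
*   my_equality,                  // template<typename A, typename B> equality check
*   my_identity,                  // the identity element
*   type_list<gen_a, gen_b>       // (possibly redundant) generators
* > group;
* typedef group::type all_elements;           // type_list of every group element
* constexpr int flags = group::global_flags;  // accumulated global flags
* \endcode
*/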
|
||||
|
||||
} // end namespace group_theory
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
|
||||
|
||||
/*
|
||||
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
|
||||
*/
|
|
@@ -0,0 +1,233 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
|
||||
#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// EventCount allows waiting for arbitrary predicates in non-blocking
|
||||
// algorithms. Think of a condition variable, but the wait predicate does not need to
|
||||
// be protected by a mutex. Usage:
|
||||
// Waiting thread does:
|
||||
//
|
||||
// if (predicate)
|
||||
// return act();
|
||||
// EventCount::Waiter& w = waiters[my_index];
|
||||
// ec.Prewait(&w);
|
||||
// if (predicate) {
|
||||
// ec.CancelWait(&w);
|
||||
// return act();
|
||||
// }
|
||||
// ec.CommitWait(&w);
|
||||
//
|
||||
// Notifying thread does:
|
||||
//
|
||||
// predicate = true;
|
||||
// ec.Notify(true);
|
||||
//
|
||||
// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
|
||||
// cheap, but they are executed only if the preceding predicate check has
|
||||
// failed.
|
||||
//
|
||||
// Algorithm outline:
|
||||
// There are two main variables: predicate (managed by user) and state_.
|
||||
// Operation closely resembles Dekker's mutual exclusion algorithm:
|
||||
// xxxps://en.wikipedia.org/wiki/Dekker%27s_algorithm
|
||||
// Waiting thread sets state_ then checks predicate, Notifying thread sets
|
||||
// predicate then checks state_. Due to seq_cst fences in between these
|
||||
// operations it is guaranteed that either the waiter will see the predicate change
|
||||
// and won't block, or the notifying thread will see the state_ change and will unblock
|
||||
// the waiter, or both. But it can't happen that both threads don't see each
|
||||
// other's changes, which would lead to deadlock.
|
||||
class EventCount {
|
||||
public:
|
||||
class Waiter;
|
||||
|
||||
EventCount(MaxSizeVector<Waiter>& waiters) : waiters_(waiters) {
|
||||
eigen_assert(waiters.size() < (1 << kWaiterBits) - 1);
|
||||
// Initialize epoch to something close to overflow to test overflow.
|
||||
state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2);
|
||||
}
|
||||
|
||||
~EventCount() {
|
||||
// Ensure there are no waiters.
|
||||
eigen_plain_assert((state_.load() & (kStackMask | kWaiterMask)) == kStackMask);
|
||||
}
|
||||
|
||||
// Prewait prepares for waiting.
|
||||
// After calling this function the thread must re-check the wait predicate
|
||||
// and call either CancelWait or CommitWait passing the same Waiter object.
|
||||
void Prewait(Waiter* w) {
|
||||
w->epoch = state_.fetch_add(kWaiterInc, std::memory_order_relaxed);
|
||||
std::atomic_thread_fence(std::memory_order_seq_cst);
|
||||
}
|
||||
|
||||
// CommitWait commits waiting.
|
||||
void CommitWait(Waiter* w) {
|
||||
w->state = Waiter::kNotSignaled;
|
||||
// Modification epoch of this waiter.
|
||||
uint64_t epoch =
|
||||
(w->epoch & kEpochMask) +
|
||||
(((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
|
||||
uint64_t state = state_.load(std::memory_order_seq_cst);
|
||||
for (;;) {
|
||||
if (int64_t((state & kEpochMask) - epoch) < 0) {
|
||||
// The preceding waiter has not decided on its fate. Wait until it
|
||||
// calls either CancelWait or CommitWait, or is notified.
|
||||
EIGEN_THREAD_YIELD();
|
||||
state = state_.load(std::memory_order_seq_cst);
|
||||
continue;
|
||||
}
|
||||
// We've already been notified.
|
||||
if (int64_t((state & kEpochMask) - epoch) > 0) return;
|
||||
// Remove this thread from prewait counter and add it to the waiter list.
|
||||
eigen_assert((state & kWaiterMask) != 0);
|
||||
uint64_t newstate = state - kWaiterInc + kEpochInc;
|
||||
newstate = (newstate & ~kStackMask) | (w - &waiters_[0]);
|
||||
if ((state & kStackMask) == kStackMask)
|
||||
w->next.store(nullptr, std::memory_order_relaxed);
|
||||
else
|
||||
w->next.store(&waiters_[state & kStackMask], std::memory_order_relaxed);
|
||||
if (state_.compare_exchange_weak(state, newstate,
|
||||
std::memory_order_release))
|
||||
break;
|
||||
}
|
||||
Park(w);
|
||||
}
|
||||
|
||||
// CancelWait cancels effects of the previous Prewait call.
|
||||
void CancelWait(Waiter* w) {
|
||||
uint64_t epoch =
|
||||
(w->epoch & kEpochMask) +
|
||||
(((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
|
||||
uint64_t state = state_.load(std::memory_order_relaxed);
|
||||
for (;;) {
|
||||
if (int64_t((state & kEpochMask) - epoch) < 0) {
|
||||
// The preceding waiter has not decided on its fate. Wait until it
|
||||
// calls either CancelWait or CommitWait, or is notified.
|
||||
EIGEN_THREAD_YIELD();
|
||||
state = state_.load(std::memory_order_relaxed);
|
||||
continue;
|
||||
}
|
||||
// We've already been notified.
|
||||
if (int64_t((state & kEpochMask) - epoch) > 0) return;
|
||||
// Remove this thread from prewait counter.
|
||||
eigen_assert((state & kWaiterMask) != 0);
|
||||
if (state_.compare_exchange_weak(state, state - kWaiterInc + kEpochInc,
|
||||
std::memory_order_relaxed))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Notify wakes one or all waiting threads.
|
||||
// Must be called after changing the associated wait predicate.
|
||||
void Notify(bool all) {
|
||||
std::atomic_thread_fence(std::memory_order_seq_cst);
|
||||
uint64_t state = state_.load(std::memory_order_acquire);
|
||||
for (;;) {
|
||||
// Easy case: no waiters.
|
||||
if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0)
|
||||
return;
|
||||
uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
|
||||
uint64_t newstate;
|
||||
if (all) {
|
||||
// Reset prewait counter and empty wait list.
|
||||
newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask;
|
||||
} else if (waiters) {
|
||||
// There is a thread in pre-wait state, unblock it.
|
||||
newstate = state + kEpochInc - kWaiterInc;
|
||||
} else {
|
||||
// Pop a waiter from list and unpark it.
|
||||
Waiter* w = &waiters_[state & kStackMask];
|
||||
Waiter* wnext = w->next.load(std::memory_order_relaxed);
|
||||
uint64_t next = kStackMask;
|
||||
if (wnext != nullptr) next = wnext - &waiters_[0];
|
||||
// Note: we don't add kEpochInc here. ABA problem on the lock-free stack
|
||||
// can't happen because a waiter is re-pushed onto the stack only after
|
||||
// it was in the pre-wait state which inevitably leads to epoch
|
||||
// increment.
|
||||
newstate = (state & kEpochMask) + next;
|
||||
}
|
||||
if (state_.compare_exchange_weak(state, newstate,
|
||||
std::memory_order_acquire)) {
|
||||
if (!all && waiters) return; // unblocked pre-wait thread
|
||||
if ((state & kStackMask) == kStackMask) return;
|
||||
Waiter* w = &waiters_[state & kStackMask];
|
||||
if (!all) w->next.store(nullptr, std::memory_order_relaxed);
|
||||
Unpark(w);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class Waiter {
|
||||
friend class EventCount;
|
||||
// Align to 128 byte boundary to prevent false sharing with other Waiter objects in the same vector.
|
||||
EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next;
|
||||
std::mutex mu;
|
||||
std::condition_variable cv;
|
||||
uint64_t epoch;
|
||||
unsigned state;
|
||||
enum {
|
||||
kNotSignaled,
|
||||
kWaiting,
|
||||
kSignaled,
|
||||
};
|
||||
};
|
||||
|
||||
private:
|
||||
// State_ layout:
|
||||
// - low kStackBits is a stack of waiters that committed to wait.
|
||||
// - next kWaiterBits is count of waiters in prewait state.
|
||||
// - next kEpochBits is modification counter.
|
||||
static const uint64_t kStackBits = 16;
|
||||
static const uint64_t kStackMask = (1ull << kStackBits) - 1;
|
||||
static const uint64_t kWaiterBits = 16;
|
||||
static const uint64_t kWaiterShift = 16;
|
||||
static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
|
||||
<< kWaiterShift;
|
||||
static const uint64_t kWaiterInc = 1ull << kWaiterBits;
|
||||
static const uint64_t kEpochBits = 32;
|
||||
static const uint64_t kEpochShift = 32;
|
||||
static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
|
||||
static const uint64_t kEpochInc = 1ull << kEpochShift;
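
// Illustrative decomposition of a state_ value (a sketch, not used by the
// code): with the constants above, bits [0,16) hold the index of the waiter
// at the top of the committed-wait stack (or kStackMask if the stack is
// empty), bits [16,32) count threads in the prewait state, and bits [32,64)
// form the modification epoch.
//
//   uint64_t s          = state_.load();
//   uint64_t stack_top  = s & kStackMask;
//   uint64_t prewaiting = (s & kWaiterMask) >> kWaiterShift;
//   uint64_t epoch      = (s & kEpochMask) >> kEpochShift;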
|
||||
std::atomic<uint64_t> state_;
|
||||
MaxSizeVector<Waiter>& waiters_;
|
||||
|
||||
void Park(Waiter* w) {
|
||||
std::unique_lock<std::mutex> lock(w->mu);
|
||||
while (w->state != Waiter::kSignaled) {
|
||||
w->state = Waiter::kWaiting;
|
||||
w->cv.wait(lock);
|
||||
}
|
||||
}
|
||||
|
||||
void Unpark(Waiter* waiters) {
|
||||
Waiter* next = nullptr;
|
||||
for (Waiter* w = waiters; w; w = next) {
|
||||
next = w->next.load(std::memory_order_relaxed);
|
||||
unsigned state;
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(w->mu);
|
||||
state = w->state;
|
||||
w->state = Waiter::kSignaled;
|
||||
}
|
||||
// Avoid notifying if it wasn't waiting.
|
||||
if (state == Waiter::kWaiting) w->cv.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
EventCount(const EventCount&) = delete;
|
||||
void operator=(const EventCount&) = delete;
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
|
|
@@ -0,0 +1,274 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
|
||||
#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
template <typename Environment>
|
||||
class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
public:
|
||||
typedef typename Environment::Task Task;
|
||||
typedef RunQueue<Task, 1024> Queue;
|
||||
|
||||
NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
|
||||
: env_(env),
|
||||
threads_(num_threads),
|
||||
queues_(num_threads),
|
||||
coprimes_(num_threads),
|
||||
waiters_(num_threads),
|
||||
blocked_(0),
|
||||
spinning_(0),
|
||||
done_(false),
|
||||
ec_(waiters_) {
|
||||
waiters_.resize(num_threads);
|
||||
|
||||
// Calculate coprimes of num_threads.
|
||||
// Coprimes are used for a random walk over all threads in Steal
|
||||
// and NonEmptyQueueIndex. Iteration is based on the fact that if we take
|
||||
// a walk starting at thread index t and calculate num_threads - 1 subsequent
|
||||
// indices as (t + coprime) % num_threads, we will cover all threads without
|
||||
// repetitions (effectively getting a pseudo-random permutation of thread
|
||||
// indices).
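// Example (a sketch): with num_threads = 4 and coprime = 3, a walk starting
// at t = 1 visits 1, (1+3)%4 = 0, (0+3)%4 = 3, (3+3)%4 = 2, i.e. every
// thread exactly once before repeating.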
|
||||
for (int i = 1; i <= num_threads; i++) {
|
||||
unsigned a = i;
|
||||
unsigned b = num_threads;
|
||||
// If GCD(a, b) == 1, then a and b are coprimes.
|
||||
while (b != 0) {
|
||||
unsigned tmp = a;
|
||||
a = b;
|
||||
b = tmp % b;
|
||||
}
|
||||
if (a == 1) {
|
||||
coprimes_.push_back(i);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < num_threads; i++) {
|
||||
queues_.push_back(new Queue());
|
||||
}
|
||||
for (int i = 0; i < num_threads; i++) {
|
||||
threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
|
||||
}
|
||||
}
|
||||
|
||||
~NonBlockingThreadPoolTempl() {
|
||||
done_ = true;
|
||||
// Now if all threads block without work, they will start exiting.
|
||||
// But note that threads can continue to work arbitrarily long,
|
||||
// block, submit new work, unblock and otherwise live a full life.
|
||||
ec_.Notify(true);
|
||||
|
||||
// Join threads explicitly to avoid destruction order issues.
|
||||
for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
|
||||
for (size_t i = 0; i < threads_.size(); i++) delete queues_[i];
|
||||
}
|
||||
|
||||
void Schedule(std::function<void()> fn) {
|
||||
Task t = env_.CreateTask(std::move(fn));
|
||||
PerThread* pt = GetPerThread();
|
||||
if (pt->pool == this) {
|
||||
// Worker thread of this pool, push onto the thread's queue.
|
||||
Queue* q = queues_[pt->thread_id];
|
||||
t = q->PushFront(std::move(t));
|
||||
} else {
|
||||
// A free-standing thread (or worker of another pool), push onto a random
|
||||
// queue.
|
||||
Queue* q = queues_[Rand(&pt->rand) % queues_.size()];
|
||||
t = q->PushBack(std::move(t));
|
||||
}
|
||||
// Note: below we touch this after making t available to worker threads.
|
||||
// Strictly speaking, this can lead to a racy-use-after-free. Consider that
|
||||
// Schedule is called from a thread that is neither main thread nor a worker
|
||||
// thread of this pool. Then, execution of t directly or indirectly
|
||||
// completes overall computations, which in turn leads to destruction of
|
||||
// this. We expect that such scenario is prevented by program, that is,
|
||||
// this is kept alive while any threads can potentially be in Schedule.
|
||||
if (!t.f)
|
||||
ec_.Notify(false);
|
||||
else
|
||||
env_.ExecuteTask(t); // Push failed, execute directly.
|
||||
}
|
||||
|
||||
int NumThreads() const final {
|
||||
return static_cast<int>(threads_.size());
|
||||
}
|
||||
|
||||
int CurrentThreadId() const final {
|
||||
const PerThread* pt =
|
||||
const_cast<NonBlockingThreadPoolTempl*>(this)->GetPerThread();
|
||||
if (pt->pool == this) {
|
||||
return pt->thread_id;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
typedef typename Environment::EnvThread Thread;
|
||||
|
||||
struct PerThread {
|
||||
constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { }
|
||||
NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads.
|
||||
uint64_t rand; // Random generator state.
|
||||
int thread_id; // Worker thread index in pool.
|
||||
};
|
||||
|
||||
Environment env_;
|
||||
MaxSizeVector<Thread*> threads_;
|
||||
MaxSizeVector<Queue*> queues_;
|
||||
MaxSizeVector<unsigned> coprimes_;
|
||||
MaxSizeVector<EventCount::Waiter> waiters_;
|
||||
std::atomic<unsigned> blocked_;
|
||||
std::atomic<bool> spinning_;
|
||||
std::atomic<bool> done_;
|
||||
EventCount ec_;
|
||||
|
||||
// Main worker thread loop.
|
||||
void WorkerLoop(int thread_id) {
|
||||
PerThread* pt = GetPerThread();
|
||||
pt->pool = this;
|
||||
pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
|
||||
pt->thread_id = thread_id;
|
||||
Queue* q = queues_[thread_id];
|
||||
EventCount::Waiter* waiter = &waiters_[thread_id];
|
||||
for (;;) {
|
||||
Task t = q->PopFront();
|
||||
if (!t.f) {
|
||||
t = Steal();
|
||||
if (!t.f) {
|
||||
// Leave one thread spinning. This reduces latency.
|
||||
// TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it.
|
||||
// Also, the time it takes to attempt to steal work 1000 times depends
|
||||
// on the size of the thread pool. However the speed at which the user
|
||||
// of the thread pool submits tasks is independent of the size of the
|
||||
// pool. Consider a time based limit instead.
|
||||
if (!spinning_ && !spinning_.exchange(true)) {
|
||||
for (int i = 0; i < 1000 && !t.f; i++) {
|
||||
t = Steal();
|
||||
}
|
||||
spinning_ = false;
|
||||
}
|
||||
if (!t.f) {
|
||||
if (!WaitForWork(waiter, &t)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (t.f) {
|
||||
env_.ExecuteTask(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Steal tries to steal work from other worker threads in best-effort manner.
|
||||
Task Steal() {
|
||||
PerThread* pt = GetPerThread();
|
||||
const size_t size = queues_.size();
|
||||
unsigned r = Rand(&pt->rand);
|
||||
unsigned inc = coprimes_[r % coprimes_.size()];
|
||||
unsigned victim = r % size;
|
||||
for (unsigned i = 0; i < size; i++) {
|
||||
Task t = queues_[victim]->PopBack();
|
||||
if (t.f) {
|
||||
return t;
|
||||
}
|
||||
victim += inc;
|
||||
if (victim >= size) {
|
||||
victim -= size;
|
||||
}
|
||||
}
|
||||
return Task();
|
||||
}
|
||||
|
||||
// WaitForWork blocks until new work is available (returns true), or if it is
|
||||
// time to exit (returns false). Can optionally return a task to execute in t
|
||||
// (in such case t.f != nullptr on return).
|
||||
bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
|
||||
eigen_assert(!t->f);
|
||||
// We already did best-effort emptiness check in Steal, so prepare for
|
||||
// blocking.
|
||||
ec_.Prewait(waiter);
|
||||
// Now do a reliable emptiness check.
|
||||
int victim = NonEmptyQueueIndex();
|
||||
if (victim != -1) {
|
||||
ec_.CancelWait(waiter);
|
||||
*t = queues_[victim]->PopBack();
|
||||
return true;
|
||||
}
|
||||
// Number of blocked threads is used as termination condition.
|
||||
// If we are shutting down and all worker threads blocked without work,
|
||||
// then we are done.
|
||||
blocked_++;
|
||||
if (done_ && blocked_ == threads_.size()) {
|
||||
ec_.CancelWait(waiter);
|
||||
// Almost done, but need to re-check queues.
|
||||
// Consider that all queues are empty and all worker threads are preempted
|
||||
// right after incrementing blocked_ above. Now a free-standing thread
|
||||
// submits work and calls the destructor (which sets done_). If we don't
|
||||
// re-check queues, we will exit leaving the work unexecuted.
|
||||
if (NonEmptyQueueIndex() != -1) {
|
||||
// Note: we must not pop from queues before we decrement blocked_,
|
||||
// otherwise the following scenario is possible. Consider that instead
|
||||
// of checking for emptiness we popped the only element from queues.
|
||||
// Now other worker threads can start exiting, which is bad if the
|
||||
// work item submits other work. So we just check emptiness here,
|
||||
// which ensures that all worker threads exit at the same time.
|
||||
blocked_--;
|
||||
return true;
|
||||
}
|
||||
// Reached stable termination state.
|
||||
ec_.Notify(true);
|
||||
return false;
|
||||
}
|
||||
ec_.CommitWait(waiter);
|
||||
blocked_--;
|
||||
return true;
|
||||
}
|
||||
|
||||
int NonEmptyQueueIndex() {
|
||||
PerThread* pt = GetPerThread();
|
||||
const size_t size = queues_.size();
|
||||
unsigned r = Rand(&pt->rand);
|
||||
unsigned inc = coprimes_[r % coprimes_.size()];
|
||||
unsigned victim = r % size;
|
||||
for (unsigned i = 0; i < size; i++) {
|
||||
if (!queues_[victim]->Empty()) {
|
||||
return victim;
|
||||
}
|
||||
victim += inc;
|
||||
if (victim >= size) {
|
||||
victim -= size;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
static EIGEN_STRONG_INLINE PerThread* GetPerThread() {
|
||||
EIGEN_THREAD_LOCAL PerThread per_thread_;
|
||||
PerThread* pt = &per_thread_;
|
||||
return pt;
|
||||
}
|
||||
|
||||
static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
|
||||
uint64_t current = *state;
|
||||
// Update the internal state
|
||||
*state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
|
||||
// Generate the random output (using the PCG-XSH-RS scheme)
|
||||
return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
|
||||
}
|
||||
};
|
||||
|
||||
typedef NonBlockingThreadPoolTempl<StlThreadEnvironment> NonBlockingThreadPool;
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
|
|
@@ -0,0 +1,210 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
|
||||
#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// RunQueue is a fixed-size, partially non-blocking deque of Work items.
|
||||
// Operations on front of the queue must be done by a single thread (owner),
|
||||
// operations on back of the queue can be done by multiple threads concurrently.
|
||||
//
|
||||
// Algorithm outline:
|
||||
// All remote threads operating on the queue back are serialized by a mutex.
|
||||
// This ensures that at most two threads access state: owner and one remote
|
||||
// thread (Size aside). The algorithm ensures that the occupied region of the
|
||||
// underlying array is logically contiguous (it can wrap around, but there are no stray
|
||||
// occupied elements). Owner operates on one end of this region, remote thread
|
||||
// operates on the other end. Synchronization between these threads
|
||||
// (potential consumption of the last element and take up of the last empty
|
||||
// element) happens by means of state variable in each element. States are:
|
||||
// empty, busy (in process of insertion or removal) and ready. Threads claim
|
||||
// elements (empty->busy and ready->busy transitions) by means of a CAS
|
||||
// operation. The finishing transitions (busy->empty and busy->ready) are done
|
||||
// with plain store as the element is exclusively owned by the current thread.
|
||||
//
|
||||
// Note: we could permit only pointers as elements, then we would not need
|
||||
// separate state variable as null/non-null pointer value would serve as state,
|
||||
// but that would require malloc/free per operation for large, complex values
|
||||
// (and this queue is designed to store std::function<void()>).
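
// Usage sketch (hedged): Work must be default-constructible and movable; Task
// here refers to the Environment's task type used by the thread pools in this
// module, whose member f is the wrapped std::function, and t is an already
// created Task.
//
//   RunQueue<Task, 1024> q;
//   Task overflow = q.PushFront(std::move(t));  // owner thread only
//   if (overflow.f) { /* queue was full, the task was handed back */ }
//   Task stolen = q.PopBack();                  // any thread may steal
//   if (stolen.f) { /* got a work item */ }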
|
||||
template <typename Work, unsigned kSize>
|
||||
class RunQueue {
|
||||
public:
|
||||
RunQueue() : front_(0), back_(0) {
|
||||
// require power-of-two for fast masking
|
||||
eigen_assert((kSize & (kSize - 1)) == 0);
|
||||
eigen_assert(kSize > 2); // why would you do this?
|
||||
eigen_assert(kSize <= (64 << 10)); // leave enough space for counter
|
||||
for (unsigned i = 0; i < kSize; i++)
|
||||
array_[i].state.store(kEmpty, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
~RunQueue() { eigen_plain_assert(Size() == 0); }
|
||||
|
||||
// PushFront inserts w at the beginning of the queue.
|
||||
// If queue is full returns w, otherwise returns default-constructed Work.
|
||||
Work PushFront(Work w) {
|
||||
unsigned front = front_.load(std::memory_order_relaxed);
|
||||
Elem* e = &array_[front & kMask];
|
||||
uint8_t s = e->state.load(std::memory_order_relaxed);
|
||||
if (s != kEmpty ||
|
||||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
|
||||
return w;
|
||||
front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
|
||||
e->w = std::move(w);
|
||||
e->state.store(kReady, std::memory_order_release);
|
||||
return Work();
|
||||
}
|
||||
|
||||
// PopFront removes and returns the first element in the queue.
|
||||
// If the queue was empty returns default-constructed Work.
|
||||
Work PopFront() {
|
||||
unsigned front = front_.load(std::memory_order_relaxed);
|
||||
Elem* e = &array_[(front - 1) & kMask];
|
||||
uint8_t s = e->state.load(std::memory_order_relaxed);
|
||||
if (s != kReady ||
|
||||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
|
||||
return Work();
|
||||
Work w = std::move(e->w);
|
||||
e->state.store(kEmpty, std::memory_order_release);
|
||||
front = ((front - 1) & kMask2) | (front & ~kMask2);
|
||||
front_.store(front, std::memory_order_relaxed);
|
||||
return w;
|
||||
}
|
||||
|
||||
// PushBack adds w at the end of the queue.
|
||||
// If queue is full returns w, otherwise returns default-constructed Work.
|
||||
Work PushBack(Work w) {
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
unsigned back = back_.load(std::memory_order_relaxed);
|
||||
Elem* e = &array_[(back - 1) & kMask];
|
||||
uint8_t s = e->state.load(std::memory_order_relaxed);
|
||||
if (s != kEmpty ||
|
||||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
|
||||
return w;
|
||||
back = ((back - 1) & kMask2) | (back & ~kMask2);
|
||||
back_.store(back, std::memory_order_relaxed);
|
||||
e->w = std::move(w);
|
||||
e->state.store(kReady, std::memory_order_release);
|
||||
return Work();
|
||||
}
|
||||
|
||||
// PopBack removes and returns the last element in the queue.
|
||||
// Can fail spuriously.
|
||||
Work PopBack() {
|
||||
if (Empty()) return Work();
|
||||
std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
|
||||
if (!lock) return Work();
|
||||
unsigned back = back_.load(std::memory_order_relaxed);
|
||||
Elem* e = &array_[back & kMask];
|
||||
uint8_t s = e->state.load(std::memory_order_relaxed);
|
||||
if (s != kReady ||
|
||||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
|
||||
return Work();
|
||||
Work w = std::move(e->w);
|
||||
e->state.store(kEmpty, std::memory_order_release);
|
||||
back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
|
||||
return w;
|
||||
}
|
||||
|
||||
// PopBackHalf removes and returns up to half of the last elements in the queue.
|
||||
// Returns the number of elements removed, but can also fail spuriously.
|
||||
unsigned PopBackHalf(std::vector<Work>* result) {
|
||||
if (Empty()) return 0;
|
||||
std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
|
||||
if (!lock) return 0;
|
||||
unsigned back = back_.load(std::memory_order_relaxed);
|
||||
unsigned size = Size();
|
||||
unsigned mid = back;
|
||||
if (size > 1) mid = back + (size - 1) / 2;
|
||||
unsigned n = 0;
|
||||
unsigned start = 0;
|
||||
for (; static_cast<int>(mid - back) >= 0; mid--) {
|
||||
Elem* e = &array_[mid & kMask];
|
||||
uint8_t s = e->state.load(std::memory_order_relaxed);
|
||||
if (n == 0) {
|
||||
if (s != kReady ||
|
||||
!e->state.compare_exchange_strong(s, kBusy,
|
||||
std::memory_order_acquire))
|
||||
continue;
|
||||
start = mid;
|
||||
} else {
|
||||
// Note: no need to store temporal kBusy, we exclusively own these
|
||||
// elements.
|
||||
eigen_assert(s == kReady);
|
||||
}
|
||||
result->push_back(std::move(e->w));
|
||||
e->state.store(kEmpty, std::memory_order_release);
|
||||
n++;
|
||||
}
|
||||
if (n != 0)
|
||||
back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
|
||||
return n;
|
||||
}
|
||||
|
||||
// Size returns current queue size.
|
||||
// Can be called by any thread at any time.
|
||||
unsigned Size() const {
|
||||
// Emptiness plays critical role in thread pool blocking. So we go to great
|
||||
// effort to not produce false positives (claim non-empty queue as empty).
|
||||
for (;;) {
|
||||
// Capture a consistent snapshot of front/back.
|
||||
unsigned front = front_.load(std::memory_order_acquire);
|
||||
unsigned back = back_.load(std::memory_order_acquire);
|
||||
unsigned front1 = front_.load(std::memory_order_relaxed);
|
||||
if (front != front1) continue;
|
||||
int size = (front & kMask2) - (back & kMask2);
|
||||
// Fix overflow.
|
||||
if (size < 0) size += 2 * kSize;
|
||||
// Order of modification in push/pop is crafted to make the queue look
|
||||
// larger than it is during concurrent modifications. E.g. pop can
|
||||
// decrement size before the corresponding push has incremented it.
|
||||
// So the computed size can be up to kSize + 1, fix it.
|
||||
if (size > static_cast<int>(kSize)) size = kSize;
|
||||
return size;
|
||||
}
|
||||
}
|
||||
|
||||
// Empty tests whether container is empty.
|
||||
// Can be called by any thread at any time.
|
||||
bool Empty() const { return Size() == 0; }
|
||||
|
||||
private:
|
||||
static const unsigned kMask = kSize - 1;
|
||||
static const unsigned kMask2 = (kSize << 1) - 1;
|
||||
struct Elem {
|
||||
std::atomic<uint8_t> state;
|
||||
Work w;
|
||||
};
|
||||
enum {
|
||||
kEmpty,
|
||||
kBusy,
|
||||
kReady,
|
||||
};
|
||||
std::mutex mutex_;
|
||||
// Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
|
||||
// front/back, respectively. The remaining bits contain modification counters
|
||||
// that are incremented on Push operations. This allows us to (1) distinguish
|
||||
// between empty and full conditions (if we would use log(kSize) bits for
|
||||
// position, these conditions would be indistinguishable); (2) obtain
|
||||
// consistent snapshot of front_/back_ for Size operation using the
|
||||
// modification counters.
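// Example (a sketch): with kSize = 1024 the low 11 bits of front_/back_ form
// the rolling index (kMask = 0x3ff selects the array slot, kMask2 = 0x7ff the
// rolling index), and Size() compares (front_ & kMask2) with (back_ & kMask2)
// while the higher bits serve as the modification counters mentioned above.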
|
||||
std::atomic<unsigned> front_;
|
||||
std::atomic<unsigned> back_;
|
||||
Elem array_[kSize];
|
||||
|
||||
RunQueue(const RunQueue&) = delete;
|
||||
void operator=(const RunQueue&) = delete;
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
|
|
@@ -0,0 +1,154 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
|
||||
#define EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// The implementation of the ThreadPool type ensures that the Schedule method
|
||||
// runs the functions it is provided in FIFO order when the scheduling is done
|
||||
// by a single thread.
|
||||
// Environment provides a way to create threads and also allows intercepting
|
||||
// task submission and execution.
|
||||
template <typename Environment>
|
||||
class SimpleThreadPoolTempl : public ThreadPoolInterface {
|
||||
public:
|
||||
// Construct a pool that contains "num_threads" threads.
|
||||
explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment())
|
||||
: env_(env), threads_(num_threads), waiters_(num_threads) {
|
||||
for (int i = 0; i < num_threads; i++) {
|
||||
threads_.push_back(env.CreateThread([this, i]() { WorkerLoop(i); }));
|
||||
}
|
||||
}
|
||||
|
||||
// Wait until all scheduled work has finished and then destroy the
|
||||
// set of threads.
|
||||
~SimpleThreadPoolTempl() {
|
||||
{
|
||||
// Wait for all work to get done.
|
||||
std::unique_lock<std::mutex> l(mu_);
|
||||
while (!pending_.empty()) {
|
||||
empty_.wait(l);
|
||||
}
|
||||
exiting_ = true;
|
||||
|
||||
// Wakeup all waiters.
|
||||
for (auto w : waiters_) {
|
||||
w->ready = true;
|
||||
w->task.f = nullptr;
|
||||
w->cv.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for threads to finish.
|
||||
for (auto t : threads_) {
|
||||
delete t;
|
||||
}
|
||||
}
|
||||
|
||||
// Schedule fn() for execution in the pool of threads. The functions are
|
||||
// executed in the order in which they are scheduled.
|
||||
void Schedule(std::function<void()> fn) final {
|
||||
Task t = env_.CreateTask(std::move(fn));
|
||||
std::unique_lock<std::mutex> l(mu_);
|
||||
if (waiters_.empty()) {
|
||||
pending_.push_back(std::move(t));
|
||||
} else {
|
||||
Waiter* w = waiters_.back();
|
||||
waiters_.pop_back();
|
||||
w->ready = true;
|
||||
w->task = std::move(t);
|
||||
w->cv.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
int NumThreads() const final {
|
||||
return static_cast<int>(threads_.size());
|
||||
}
|
||||
|
||||
int CurrentThreadId() const final {
|
||||
const PerThread* pt = this->GetPerThread();
|
||||
if (pt->pool == this) {
|
||||
return pt->thread_id;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
void WorkerLoop(int thread_id) {
|
||||
std::unique_lock<std::mutex> l(mu_);
|
||||
PerThread* pt = GetPerThread();
|
||||
pt->pool = this;
|
||||
pt->thread_id = thread_id;
|
||||
Waiter w;
|
||||
Task t;
|
||||
while (!exiting_) {
|
||||
if (pending_.empty()) {
|
||||
// Wait for work to be assigned to me
|
||||
w.ready = false;
|
||||
waiters_.push_back(&w);
|
||||
while (!w.ready) {
|
||||
w.cv.wait(l);
|
||||
}
|
||||
t = w.task;
|
||||
w.task.f = nullptr;
|
||||
} else {
|
||||
// Pick up pending work
|
||||
t = std::move(pending_.front());
|
||||
pending_.pop_front();
|
||||
if (pending_.empty()) {
|
||||
empty_.notify_all();
|
||||
}
|
||||
}
|
||||
if (t.f) {
|
||||
mu_.unlock();
|
||||
env_.ExecuteTask(t);
|
||||
t.f = nullptr;
|
||||
mu_.lock();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
typedef typename Environment::Task Task;
|
||||
typedef typename Environment::EnvThread Thread;
|
||||
|
||||
struct Waiter {
|
||||
std::condition_variable cv;
|
||||
Task task;
|
||||
bool ready;
|
||||
};
|
||||
|
||||
struct PerThread {
|
||||
constexpr PerThread() : pool(NULL), thread_id(-1) { }
|
||||
SimpleThreadPoolTempl* pool; // Parent pool, or null for normal threads.
|
||||
int thread_id; // Worker thread index in pool.
|
||||
};
|
||||
|
||||
Environment env_;
|
||||
std::mutex mu_;
|
||||
MaxSizeVector<Thread*> threads_; // All threads
|
||||
MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads.
|
||||
std::deque<Task> pending_; // Queue of pending work
|
||||
std::condition_variable empty_; // Signaled on pending_.empty()
|
||||
bool exiting_ = false;
|
||||
|
||||
PerThread* GetPerThread() const {
|
||||
EIGEN_THREAD_LOCAL PerThread per_thread;
|
||||
return &per_thread;
|
||||
}
|
||||
};
|
||||
|
||||
typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool;
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
|
|
@@ -0,0 +1,38 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
|
||||
#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
struct StlThreadEnvironment {
|
||||
struct Task {
|
||||
std::function<void()> f;
|
||||
};
|
||||
|
||||
// EnvThread constructor must start the thread,
|
||||
// destructor must join the thread.
|
||||
class EnvThread {
|
||||
public:
|
||||
EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
|
||||
~EnvThread() { thr_.join(); }
|
||||
|
||||
private:
|
||||
std::thread thr_;
|
||||
};
|
||||
|
||||
EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); }
|
||||
Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
|
||||
void ExecuteTask(const Task& t) { t.f(); }
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
|
|
@ -0,0 +1,22 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
|
||||
#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
|
||||
|
||||
// Try to come up with a portable implementation of thread local variables
|
||||
#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
|
||||
#define EIGEN_THREAD_LOCAL static __thread
|
||||
#elif EIGEN_COMP_CLANG
|
||||
#define EIGEN_THREAD_LOCAL static __thread
|
||||
#else
|
||||
#define EIGEN_THREAD_LOCAL static thread_local
|
||||
#endif
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
|
|
@ -0,0 +1,33 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
|
||||
#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// This defines an interface that ThreadPoolDevice can take to use
|
||||
// custom thread pools underneath.
|
||||
class ThreadPoolInterface {
|
||||
public:
|
||||
virtual void Schedule(std::function<void()> fn) = 0;
|
||||
|
||||
// Returns the number of threads in the pool.
|
||||
virtual int NumThreads() const = 0;
|
||||
|
||||
// Returns a logical thread index between 0 and NumThreads() - 1 if called
|
||||
// from one of the threads in the pool. Returns -1 otherwise.
|
||||
virtual int CurrentThreadId() const = 0;
|
||||
|
||||
virtual ~ThreadPoolInterface() {}
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
|
|
@ -0,0 +1,20 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
|
||||
#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
|
||||
|
||||
// Try to come up with a portable way to yield
|
||||
#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
|
||||
#define EIGEN_THREAD_YIELD() sched_yield()
|
||||
#else
|
||||
#define EIGEN_THREAD_YIELD() std::this_thread::yield()
|
||||
#endif
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
|
|
@ -0,0 +1,542 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11META_H
|
||||
#define EIGEN_CXX11META_H
|
||||
|
||||
#include <vector>
|
||||
#include "EmulateArray.h"
|
||||
|
||||
// Emulate the cxx11 functionality that we need if the compiler doesn't support it.
|
||||
// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it
|
||||
// supports enough of the standard for our needs
|
||||
#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
|
||||
|
||||
#include "CXX11Workarounds.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
/** \internal
|
||||
* \file CXX11/util/CXX11Meta.h
|
||||
* This file contains generic metaprogramming classes which are not specifically related to Eigen.
|
||||
* This file expands upon Core/util/Meta.h and adds support for C++11 specific features.
|
||||
*/
|
||||
|
||||
template<typename... tt>
|
||||
struct type_list { constexpr static int count = sizeof...(tt); };
|
||||
|
||||
template<typename t, typename... tt>
|
||||
struct type_list<t, tt...> { constexpr static int count = sizeof...(tt) + 1; typedef t first_type; };
|
||||
|
||||
template<typename T, T... nn>
|
||||
struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };
|
||||
|
||||
template<typename T, T n, T... nn>
|
||||
struct numeric_list<T, n, nn...> { constexpr static std::size_t count = sizeof...(nn) + 1; constexpr static T first_value = n; };
|
||||
|
||||
/* numeric list constructors
|
||||
*
|
||||
* equivalencies:
|
||||
* constructor result
|
||||
* typename gen_numeric_list<int, 5>::type numeric_list<int, 0,1,2,3,4>
|
||||
* typename gen_numeric_list_reversed<int, 5>::type numeric_list<int, 4,3,2,1,0>
|
||||
* typename gen_numeric_list_swapped_pair<int, 5,1,2>::type numeric_list<int, 0,2,1,3,4>
|
||||
* typename gen_numeric_list_repeated<int, 0, 5>::type numeric_list<int, 0,0,0,0,0>
|
||||
*/
|
||||
|
||||
template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list : gen_numeric_list<T, n-1, start, start + n-1, ii...> {};
|
||||
template<typename T, T start, T... ii> struct gen_numeric_list<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; };
|
||||
|
||||
template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list_reversed : gen_numeric_list_reversed<T, n-1, start, ii..., start + n-1> {};
|
||||
template<typename T, T start, T... ii> struct gen_numeric_list_reversed<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; };
|
||||
|
||||
template<typename T, std::size_t n, T a, T b, T start = 0, T... ii> struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair<T, n-1, a, b, start, (start + n-1) == a ? b : ((start + n-1) == b ? a : (start + n-1)), ii...> {};
|
||||
template<typename T, T a, T b, T start, T... ii> struct gen_numeric_list_swapped_pair<T, 0, a, b, start, ii...> { typedef numeric_list<T, ii...> type; };
|
||||
|
||||
template<typename T, std::size_t n, T V, T... nn> struct gen_numeric_list_repeated : gen_numeric_list_repeated<T, n-1, V, V, nn...> {};
|
||||
template<typename T, T V, T... nn> struct gen_numeric_list_repeated<T, 0, V, nn...> { typedef numeric_list<T, nn...> type; };
|
||||
|
||||
/* list manipulation: concatenate */
|
||||
|
||||
template<class a, class b> struct concat;
|
||||
|
||||
template<typename... as, typename... bs> struct concat<type_list<as...>, type_list<bs...>> { typedef type_list<as..., bs...> type; };
|
||||
template<typename T, T... as, T... bs> struct concat<numeric_list<T, as...>, numeric_list<T, bs...> > { typedef numeric_list<T, as..., bs...> type; };
|
||||
|
||||
template<typename... p> struct mconcat;
|
||||
template<typename a> struct mconcat<a> { typedef a type; };
|
||||
template<typename a, typename b> struct mconcat<a, b> : concat<a, b> {};
|
||||
template<typename a, typename b, typename... cs> struct mconcat<a, b, cs...> : concat<a, typename mconcat<b, cs...>::type> {};
|
||||
|
||||
/* list manipulation: extract slices */
|
||||
|
||||
template<int n, typename x> struct take;
|
||||
template<int n, typename a, typename... as> struct take<n, type_list<a, as...>> : concat<type_list<a>, typename take<n-1, type_list<as...>>::type> {};
|
||||
template<int n> struct take<n, type_list<>> { typedef type_list<> type; };
|
||||
template<typename a, typename... as> struct take<0, type_list<a, as...>> { typedef type_list<> type; };
|
||||
template<> struct take<0, type_list<>> { typedef type_list<> type; };
|
||||
|
||||
template<typename T, int n, T a, T... as> struct take<n, numeric_list<T, a, as...>> : concat<numeric_list<T, a>, typename take<n-1, numeric_list<T, as...>>::type> {};
|
||||
template<typename T, int n> struct take<n, numeric_list<T>> { typedef numeric_list<T> type; };
|
||||
template<typename T, T a, T... as> struct take<0, numeric_list<T, a, as...>> { typedef numeric_list<T> type; };
|
||||
template<typename T> struct take<0, numeric_list<T>> { typedef numeric_list<T> type; };
|
||||
|
||||
template<typename T, int n, T... ii> struct h_skip_helper_numeric;
|
||||
template<typename T, int n, T i, T... ii> struct h_skip_helper_numeric<T, n, i, ii...> : h_skip_helper_numeric<T, n-1, ii...> {};
|
||||
template<typename T, T i, T... ii> struct h_skip_helper_numeric<T, 0, i, ii...> { typedef numeric_list<T, i, ii...> type; };
|
||||
template<typename T, int n> struct h_skip_helper_numeric<T, n> { typedef numeric_list<T> type; };
|
||||
template<typename T> struct h_skip_helper_numeric<T, 0> { typedef numeric_list<T> type; };
|
||||
|
||||
template<int n, typename... tt> struct h_skip_helper_type;
|
||||
template<int n, typename t, typename... tt> struct h_skip_helper_type<n, t, tt...> : h_skip_helper_type<n-1, tt...> {};
|
||||
template<typename t, typename... tt> struct h_skip_helper_type<0, t, tt...> { typedef type_list<t, tt...> type; };
|
||||
template<int n> struct h_skip_helper_type<n> { typedef type_list<> type; };
|
||||
template<> struct h_skip_helper_type<0> { typedef type_list<> type; };
|
||||
|
||||
template<int n>
|
||||
struct h_skip {
|
||||
template<typename T, T... ii>
|
||||
constexpr static inline typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
|
||||
template<typename... tt>
|
||||
constexpr static inline typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
|
||||
};
|
||||
|
||||
template<int n, typename a> struct skip { typedef decltype(h_skip<n>::helper(a())) type; };
|
||||
|
||||
template<int start, int count, typename a> struct slice : take<count, typename skip<start, a>::type> {};
|
||||
|
||||
/* list manipulation: retrieve single element from list */
|
||||
|
||||
template<int n, typename x> struct get;
|
||||
|
||||
template<int n, typename a, typename... as> struct get<n, type_list<a, as...>> : get<n-1, type_list<as...>> {};
|
||||
template<typename a, typename... as> struct get<0, type_list<a, as...>> { typedef a type; };
|
||||
|
||||
template<typename T, int n, T a, T... as> struct get<n, numeric_list<T, a, as...>> : get<n-1, numeric_list<T, as...>> {};
|
||||
template<typename T, T a, T... as> struct get<0, numeric_list<T, a, as...>> { constexpr static T value = a; };
|
||||
|
||||
/* always get type, regardless of dummy; good for parameter pack expansion */
|
||||
|
||||
template<typename T, T dummy, typename t> struct id_numeric { typedef t type; };
|
||||
template<typename dummy, typename t> struct id_type { typedef t type; };
|
||||
|
||||
/* equality checking, flagged version */
|
||||
|
||||
template<typename a, typename b> struct is_same_gf : is_same<a, b> { constexpr static int global_flags = 0; };
|
||||
|
||||
/* apply_op to list */
|
||||
|
||||
template<
|
||||
bool from_left, // false
|
||||
template<typename, typename> class op,
|
||||
typename additional_param,
|
||||
typename... values
|
||||
>
|
||||
struct h_apply_op_helper { typedef type_list<typename op<values, additional_param>::type...> type; };
|
||||
template<
|
||||
template<typename, typename> class op,
|
||||
typename additional_param,
|
||||
typename... values
|
||||
>
|
||||
struct h_apply_op_helper<true, op, additional_param, values...> { typedef type_list<typename op<additional_param, values>::type...> type; };
|
||||
|
||||
template<
|
||||
bool from_left,
|
||||
template<typename, typename> class op,
|
||||
typename additional_param
|
||||
>
|
||||
struct h_apply_op
|
||||
{
|
||||
template<typename... values>
|
||||
constexpr static typename h_apply_op_helper<from_left, op, additional_param, values...>::type helper(type_list<values...>)
|
||||
{ return typename h_apply_op_helper<from_left, op, additional_param, values...>::type(); }
|
||||
};
|
||||
|
||||
template<
|
||||
template<typename, typename> class op,
|
||||
typename additional_param,
|
||||
typename a
|
||||
>
|
||||
struct apply_op_from_left { typedef decltype(h_apply_op<true, op, additional_param>::helper(a())) type; };
|
||||
|
||||
template<
|
||||
template<typename, typename> class op,
|
||||
typename additional_param,
|
||||
typename a
|
||||
>
|
||||
struct apply_op_from_right { typedef decltype(h_apply_op<false, op, additional_param>::helper(a())) type; };
|
||||
|
||||
/* see if an element is in a list */
|
||||
|
||||
template<
|
||||
template<typename, typename> class test,
|
||||
typename check_against,
|
||||
typename h_list,
|
||||
bool last_check_positive = false
|
||||
>
|
||||
struct contained_in_list;
|
||||
|
||||
template<
|
||||
template<typename, typename> class test,
|
||||
typename check_against,
|
||||
typename h_list
|
||||
>
|
||||
struct contained_in_list<test, check_against, h_list, true>
|
||||
{
|
||||
constexpr static bool value = true;
|
||||
};
|
||||
|
||||
template<
|
||||
template<typename, typename> class test,
|
||||
typename check_against,
|
||||
typename a,
|
||||
typename... as
|
||||
>
|
||||
struct contained_in_list<test, check_against, type_list<a, as...>, false> : contained_in_list<test, check_against, type_list<as...>, test<check_against, a>::value> {};
|
||||
|
||||
template<
|
||||
template<typename, typename> class test,
|
||||
typename check_against
|
||||
EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty)
|
||||
>
|
||||
struct contained_in_list<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, false> { constexpr static bool value = false; };
|
||||
|
||||
/* see if an element is in a list and check for global flags */
|
||||
|
||||
template<
|
||||
template<typename, typename> class test,
|
||||
typename check_against,
|
||||
typename h_list,
|
||||
int default_flags = 0,
|
||||
bool last_check_positive = false,
|
||||
int last_check_flags = default_flags
|
||||
>
|
||||
struct contained_in_list_gf;
|
||||
|
||||
template<
|
||||
template<typename, typename> class test,
|
||||
typename check_against,
|
||||
typename h_list,
|
||||
int default_flags,
|
||||
int last_check_flags
|
||||
>
|
||||
struct contained_in_list_gf<test, check_against, h_list, default_flags, true, last_check_flags>
|
||||
{
|
||||
constexpr static bool value = true;
|
||||
constexpr static int global_flags = last_check_flags;
|
||||
};
|
||||
|
||||
template<
|
||||
template<typename, typename> class test,
|
||||
typename check_against,
|
||||
typename a,
|
||||
typename... as,
|
||||
int default_flags,
|
||||
int last_check_flags
|
||||
>
|
||||
struct contained_in_list_gf<test, check_against, type_list<a, as...>, default_flags, false, last_check_flags> : contained_in_list_gf<test, check_against, type_list<as...>, default_flags, test<check_against, a>::value, test<check_against, a>::global_flags> {};
|
||||
|
||||
template<
|
||||
template<typename, typename> class test,
|
||||
typename check_against
|
||||
EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty),
|
||||
int default_flags,
|
||||
int last_check_flags
|
||||
>
|
||||
struct contained_in_list_gf<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, default_flags, false, last_check_flags> { constexpr static bool value = false; constexpr static int global_flags = default_flags; };
|
||||
|
||||
/* generic reductions */
|
||||
|
||||
template<
|
||||
typename Reducer,
|
||||
typename... Ts
|
||||
> struct reduce;
|
||||
|
||||
template<
|
||||
typename Reducer
|
||||
> struct reduce<Reducer>
|
||||
{
|
||||
constexpr static inline int run() { return Reducer::Identity; }
|
||||
};
|
||||
|
||||
template<
|
||||
typename Reducer,
|
||||
typename A
|
||||
> struct reduce<Reducer, A>
|
||||
{
|
||||
constexpr static inline A run(A a) { return a; }
|
||||
};
|
||||
|
||||
template<
|
||||
typename Reducer,
|
||||
typename A,
|
||||
typename... Ts
|
||||
> struct reduce<Reducer, A, Ts...>
|
||||
{
|
||||
constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
|
||||
return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
|
||||
}
|
||||
};
|
||||
|
||||
/* generic binary operations */
|
||||
|
||||
struct sum_op {
|
||||
template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a + b) { return a + b; }
|
||||
static constexpr int Identity = 0;
|
||||
};
|
||||
struct product_op {
|
||||
template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a * b) { return a * b; }
|
||||
static constexpr int Identity = 1;
|
||||
};
|
||||
|
||||
struct logical_and_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a && b) { return a && b; } };
|
||||
struct logical_or_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a || b) { return a || b; } };
|
||||
|
||||
struct equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a == b) { return a == b; } };
|
||||
struct not_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a != b) { return a != b; } };
|
||||
struct lesser_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a < b) { return a < b; } };
|
||||
struct lesser_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a <= b) { return a <= b; } };
|
||||
struct greater_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a > b) { return a > b; } };
|
||||
struct greater_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a >= b) { return a >= b; } };
|
||||
|
||||
/* generic unary operations */
|
||||
|
||||
struct not_op { template<typename A> constexpr static inline auto run(A a) -> decltype(!a) { return !a; } };
|
||||
struct negation_op { template<typename A> constexpr static inline auto run(A a) -> decltype(-a) { return -a; } };
|
||||
struct greater_equal_zero_op { template<typename A> constexpr static inline auto run(A a) -> decltype(a >= 0) { return a >= 0; } };
|
||||
|
||||
|
||||
/* reductions for lists */
|
||||
|
||||
// using auto -> return value spec makes ICC 13.0 and 13.1 crash here, so we have to hack it
|
||||
// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
|
||||
// does...
|
||||
template<typename... Ts>
|
||||
constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
|
||||
{
|
||||
return reduce<product_op, Ts...>::run(ts...);
|
||||
}
|
||||
|
||||
template<typename... Ts>
|
||||
constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
|
||||
{
|
||||
return reduce<sum_op, Ts...>::run(ts...);
|
||||
}
|
||||
|
||||
/* reverse arrays */
|
||||
|
||||
template<typename Array, int... n>
|
||||
constexpr inline Array h_array_reverse(Array arr, numeric_list<int, n...>)
|
||||
{
|
||||
return {{array_get<sizeof...(n) - n - 1>(arr)...}};
|
||||
}
|
||||
|
||||
template<typename T, std::size_t N>
|
||||
constexpr inline array<T, N> array_reverse(array<T, N> arr)
|
||||
{
|
||||
return h_array_reverse(arr, typename gen_numeric_list<int, N>::type());
|
||||
}
|
||||
|
||||
|
||||
/* generic array reductions */
|
||||
|
||||
// can't reuse standard reduce() interface above because Intel's Compiler
|
||||
// *really* doesn't like it, so we just reimplement the stuff
|
||||
// (start from N - 1 and work down to 0 because specialization for
|
||||
// n == N - 1 also doesn't work in Intel's compiler, so it goes into
|
||||
// an infinite loop)
|
||||
template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
|
||||
struct h_array_reduce {
|
||||
EIGEN_DEVICE_FUNC constexpr static inline auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
|
||||
{
|
||||
return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr));
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Reducer, typename T, std::size_t N>
|
||||
struct h_array_reduce<Reducer, T, N, 0>
|
||||
{
|
||||
EIGEN_DEVICE_FUNC constexpr static inline T run(const array<T, N>& arr, T)
|
||||
{
|
||||
return array_get<0>(arr);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Reducer, typename T>
|
||||
struct h_array_reduce<Reducer, T, 0>
|
||||
{
|
||||
EIGEN_DEVICE_FUNC constexpr static inline T run(const array<T, 0>&, T identity)
|
||||
{
|
||||
return identity;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Reducer, typename T, std::size_t N>
|
||||
EIGEN_DEVICE_FUNC constexpr inline auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
|
||||
{
|
||||
return h_array_reduce<Reducer, T, N>::run(arr, identity);
|
||||
}
|
||||
|
||||
/* standard array reductions */
|
||||
|
||||
template<typename T, std::size_t N>
|
||||
EIGEN_DEVICE_FUNC constexpr inline auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
|
||||
{
|
||||
return array_reduce<sum_op, T, N>(arr, static_cast<T>(0));
|
||||
}
|
||||
|
||||
template<typename T, std::size_t N>
|
||||
EIGEN_DEVICE_FUNC constexpr inline auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
|
||||
{
|
||||
return array_reduce<product_op, T, N>(arr, static_cast<T>(1));
|
||||
}
|
||||
|
||||
template<typename t>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
|
||||
eigen_assert(a.size() > 0);
|
||||
t prod = 1;
|
||||
for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
|
||||
return prod;
|
||||
}
|
||||
|
||||
/* zip an array */
|
||||
|
||||
template<typename Op, typename A, typename B, std::size_t N, int... n>
|
||||
constexpr inline array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
|
||||
{
|
||||
return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
|
||||
}
|
||||
|
||||
template<typename Op, typename A, typename B, std::size_t N>
|
||||
constexpr inline array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
|
||||
{
|
||||
return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type());
|
||||
}
|
||||
|
||||
/* zip an array and reduce the result */
|
||||
|
||||
template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
|
||||
constexpr inline auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
|
||||
{
|
||||
return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...);
|
||||
}
|
||||
|
||||
template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
|
||||
constexpr inline auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
|
||||
{
|
||||
return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type());
|
||||
}
|
||||
|
||||
/* apply stuff to an array */
|
||||
|
||||
template<typename Op, typename A, std::size_t N, int... n>
|
||||
constexpr inline array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
|
||||
{
|
||||
return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
|
||||
}
|
||||
|
||||
template<typename Op, typename A, std::size_t N>
|
||||
constexpr inline array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
|
||||
{
|
||||
return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type());
|
||||
}
|
||||
|
||||
/* apply stuff to an array and reduce */
|
||||
|
||||
template<typename Reducer, typename Op, typename A, std::size_t N, int... n>
|
||||
constexpr inline auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
|
||||
{
|
||||
return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...);
|
||||
}
|
||||
|
||||
template<typename Reducer, typename Op, typename A, std::size_t N>
|
||||
constexpr inline auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
|
||||
{
|
||||
return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type());
|
||||
}
|
||||
|
||||
/* repeat a value n times (and make an array out of it
|
||||
* usage:
|
||||
* array<int, 16> = repeat<16>(42);
|
||||
*/
|
||||
|
||||
template<int n>
|
||||
struct h_repeat
|
||||
{
|
||||
template<typename t, int... ii>
|
||||
constexpr static inline array<t, n> run(t v, numeric_list<int, ii...>)
|
||||
{
|
||||
return {{ typename id_numeric<int, ii, t>::type(v)... }};
|
||||
}
|
||||
};
|
||||
|
||||
template<int n, typename t>
|
||||
constexpr array<t, n> repeat(t v) { return h_repeat<n>::run(v, typename gen_numeric_list<int, n>::type()); }
|
||||
|
||||
/* instantiate a class by a C-style array */
|
||||
template<class InstType, typename ArrType, std::size_t N, bool Reverse, typename... Ps>
|
||||
struct h_instantiate_by_c_array;
|
||||
|
||||
template<class InstType, typename ArrType, std::size_t N, typename... Ps>
|
||||
struct h_instantiate_by_c_array<InstType, ArrType, N, false, Ps...>
|
||||
{
|
||||
static InstType run(ArrType* arr, Ps... args)
|
||||
{
|
||||
return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, Ps..., ArrType>::run(arr + 1, args..., arr[0]);
|
||||
}
|
||||
};
|
||||
|
||||
template<class InstType, typename ArrType, std::size_t N, typename... Ps>
|
||||
struct h_instantiate_by_c_array<InstType, ArrType, N, true, Ps...>
|
||||
{
|
||||
static InstType run(ArrType* arr, Ps... args)
|
||||
{
|
||||
return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, ArrType, Ps...>::run(arr + 1, arr[0], args...);
|
||||
}
|
||||
};
|
||||
|
||||
template<class InstType, typename ArrType, typename... Ps>
|
||||
struct h_instantiate_by_c_array<InstType, ArrType, 0, false, Ps...>
|
||||
{
|
||||
static InstType run(ArrType* arr, Ps... args)
|
||||
{
|
||||
(void)arr;
|
||||
return InstType(args...);
|
||||
}
|
||||
};
|
||||
|
||||
template<class InstType, typename ArrType, typename... Ps>
|
||||
struct h_instantiate_by_c_array<InstType, ArrType, 0, true, Ps...>
|
||||
{
|
||||
static InstType run(ArrType* arr, Ps... args)
|
||||
{
|
||||
(void)arr;
|
||||
return InstType(args...);
|
||||
}
|
||||
};
|
||||
|
||||
template<class InstType, typename ArrType, std::size_t N, bool Reverse = false>
|
||||
InstType instantiate_by_c_array(ArrType* arr)
|
||||
{
|
||||
return h_instantiate_by_c_array<InstType, ArrType, N, Reverse>::run(arr);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#else // Non C++11, fallback to emulation mode
|
||||
|
||||
#include "EmulateCXX11Meta.h"
|
||||
|
||||
#endif
|
||||
|
||||
#endif // EIGEN_CXX11META_H
|
|
@ -0,0 +1,88 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_CXX11WORKAROUNDS_H
|
||||
#define EIGEN_CXX11WORKAROUNDS_H
|
||||
|
||||
/* COMPATIBILITY CHECKS
|
||||
* (so users of compilers that are too old get some realistic error messages)
|
||||
*/
|
||||
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1310)
|
||||
#error Intel Compiler only supports required C++ features since version 13.1.
|
||||
// note that most stuff in principle works with 13.0 but when combining
|
||||
// some features, at some point 13.0 will just fail with an internal assertion
|
||||
#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
|
||||
// G++ < 4.6 by default will continue processing the source files - even if we use #error to make
|
||||
// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error
|
||||
// it sees. Unfortunately, that is still not our #error directive, but at least the output is
|
||||
// short enough the user has a chance to see that the compiler version is not sufficient for
|
||||
// the funky template mojo we use.
|
||||
#pragma GCC diagnostic error "-Wfatal-errors"
|
||||
#error GNU C++ Compiler (g++) only supports required C++ features since version 4.6.
|
||||
#endif
|
||||
|
||||
/* Check that the compiler at least claims to support C++11. It might not be sufficient
|
||||
* because the compiler may not implement it correctly, but at least we'll know.
|
||||
* On the other hand, visual studio still doesn't claim to support C++11 although it's
|
||||
* compliant enugh for our purpose.
|
||||
*/
|
||||
#if (__cplusplus <= 199711L) && (EIGEN_COMP_MSVC < 1900)
|
||||
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
|
||||
#pragma GCC diagnostic error "-Wfatal-errors"
|
||||
#endif
|
||||
#error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.)
|
||||
#endif
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
/* std::get is only constexpr in C++14, not yet in C++11
|
||||
*/
|
||||
|
||||
|
||||
template<std::size_t I, class T> constexpr inline T& array_get(std::vector<T>& a) { return a[I]; }
|
||||
template<std::size_t I, class T> constexpr inline T&& array_get(std::vector<T>&& a) { return a[I]; }
|
||||
template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; }
|
||||
|
||||
/* Suppose you have a template of the form
|
||||
* template<typename T> struct X;
|
||||
* And you want to specialize it in such a way:
|
||||
* template<typename S1, typename... SN> struct X<Foo<S1, SN...>> { ::: };
|
||||
* template<> struct X<Foo<>> { ::: };
|
||||
* This will work in Intel's compiler 13.0, but only to some extent in g++ 4.6, since
|
||||
* g++ can only match templates called with parameter packs if the number of template
|
||||
* arguments is not a fixed size (so inside the first specialization, referencing
|
||||
* X<Foo<Sn...>> will fail in g++). On the other hand, g++ will accept the following:
|
||||
* template<typename S...> struct X<Foo<S...>> { ::: }:
|
||||
* as an additional (!) specialization, which will then only match the empty case.
|
||||
* But Intel's compiler 13.0 won't accept that, it will only accept the empty syntax,
|
||||
* so we have to create a workaround for this.
|
||||
*/
|
||||
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
|
||||
#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) mt... n
|
||||
#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) , EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)
|
||||
#define EIGEN_TPL_PP_SPEC_HACK_USE(n) n...
|
||||
#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) , n...
|
||||
#else
|
||||
#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)
|
||||
#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n)
|
||||
#define EIGEN_TPL_PP_SPEC_HACK_USE(n)
|
||||
#define EIGEN_TPL_PP_SPEC_HACK_USEC(n)
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11WORKAROUNDS_H
|
||||
|
||||
/*
|
||||
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
|
||||
*/
|
|
@ -0,0 +1,267 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_EMULATE_ARRAY_H
|
||||
#define EIGEN_EMULATE_ARRAY_H
|
||||
|
||||
|
||||
|
||||
// The array class is only available starting with cxx11. Emulate our own here
|
||||
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
|
||||
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
|
||||
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
|
||||
|
||||
namespace Eigen {
|
||||
template <typename T, size_t n> class array {
|
||||
public:
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE T& front() { return values[0]; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const T& front() const { return values[0]; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE T& back() { return values[n-1]; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
static std::size_t size() { return n; }
|
||||
|
||||
T values[n];
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE array() { }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE array(const T& v) {
|
||||
EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
values[0] = v;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE array(const T& v1, const T& v2) {
|
||||
EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
values[0] = v1;
|
||||
values[1] = v2;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) {
|
||||
EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
values[0] = v1;
|
||||
values[1] = v2;
|
||||
values[2] = v3;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3,
|
||||
const T& v4) {
|
||||
EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
values[0] = v1;
|
||||
values[1] = v2;
|
||||
values[2] = v3;
|
||||
values[3] = v4;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
|
||||
const T& v5) {
|
||||
EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
values[0] = v1;
|
||||
values[1] = v2;
|
||||
values[2] = v3;
|
||||
values[3] = v4;
|
||||
values[4] = v5;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
|
||||
const T& v5, const T& v6) {
|
||||
EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
values[0] = v1;
|
||||
values[1] = v2;
|
||||
values[2] = v3;
|
||||
values[3] = v4;
|
||||
values[4] = v5;
|
||||
values[5] = v6;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
|
||||
const T& v5, const T& v6, const T& v7) {
|
||||
EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
values[0] = v1;
|
||||
values[1] = v2;
|
||||
values[2] = v3;
|
||||
values[3] = v4;
|
||||
values[4] = v5;
|
||||
values[5] = v6;
|
||||
values[6] = v7;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE array(
|
||||
const T& v1, const T& v2, const T& v3, const T& v4,
|
||||
const T& v5, const T& v6, const T& v7, const T& v8) {
|
||||
EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
values[0] = v1;
|
||||
values[1] = v2;
|
||||
values[2] = v3;
|
||||
values[3] = v4;
|
||||
values[4] = v5;
|
||||
values[5] = v6;
|
||||
values[6] = v7;
|
||||
values[7] = v8;
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE array(std::initializer_list<T> l) {
|
||||
eigen_assert(l.size() == n);
|
||||
internal::smart_copy(l.begin(), l.end(), values);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
// Specialize array for zero size
|
||||
template <typename T> class array<T, 0> {
|
||||
public:
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE T& operator[] (size_t) {
|
||||
eigen_assert(false && "Can't index a zero size array");
|
||||
return dummy;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const T& operator[] (size_t) const {
|
||||
eigen_assert(false && "Can't index a zero size array");
|
||||
return dummy;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE T& front() {
|
||||
eigen_assert(false && "Can't index a zero size array");
|
||||
return dummy;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const T& front() const {
|
||||
eigen_assert(false && "Can't index a zero size array");
|
||||
return dummy;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE T& back() {
|
||||
eigen_assert(false && "Can't index a zero size array");
|
||||
return dummy;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const T& back() const {
|
||||
eigen_assert(false && "Can't index a zero size array");
|
||||
return dummy;
|
||||
}
|
||||
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE array() : dummy() { }
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
|
||||
eigen_assert(l.size() == 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
private:
|
||||
T dummy;
|
||||
};
|
||||
|
||||
// Comparison operator
|
||||
// Todo: implement !=, <, <=, >, and >=
|
||||
template<class T, std::size_t N>
|
||||
EIGEN_DEVICE_FUNC bool operator==(const array<T,N>& lhs, const array<T,N>& rhs) {
|
||||
for (std::size_t i = 0; i < N; ++i) {
|
||||
if (lhs[i] != rhs[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
namespace internal {
|
||||
template<std::size_t I, class T, std::size_t N>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
|
||||
return a[I];
|
||||
}
|
||||
template<std::size_t I, class T, std::size_t N>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
|
||||
return a[I];
|
||||
}
|
||||
|
||||
template <typename T> struct array_size;
|
||||
template<class T, std::size_t N> struct array_size<array<T,N> > {
|
||||
static const size_t value = N;
|
||||
};
|
||||
template <typename T> struct array_size;
|
||||
template<class T, std::size_t N> struct array_size<array<T,N>& > {
|
||||
static const size_t value = N;
|
||||
};
|
||||
template <typename T> struct array_size;
|
||||
template<class T, std::size_t N> struct array_size<const array<T,N> > {
|
||||
static const size_t value = N;
|
||||
};
|
||||
template <typename T> struct array_size;
|
||||
template<class T, std::size_t N> struct array_size<const array<T,N>& > {
|
||||
static const size_t value = N;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#else
|
||||
|
||||
// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen::array
|
||||
#include <array>
|
||||
namespace Eigen {
|
||||
|
||||
template <typename T, std::size_t N> using array = std::array<T, N>;
|
||||
|
||||
namespace internal {
|
||||
/* std::get is only constexpr in C++14, not yet in C++11
|
||||
* - libstdc++ from version 4.7 onwards has it nevertheless,
|
||||
* so use that
|
||||
* - libstdc++ older versions: use _M_instance directly
|
||||
* - libc++ all versions so far: use __elems_ directly
|
||||
* - all other libs: use std::get to be portable, but
|
||||
* this may not be constexpr
|
||||
*/
|
||||
#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322
|
||||
#define STD_GET_ARR_HACK a._M_instance[I]
|
||||
#elif defined(_LIBCPP_VERSION)
|
||||
#define STD_GET_ARR_HACK a.__elems_[I]
|
||||
#else
|
||||
#define STD_GET_ARR_HACK std::template get<I, T, N>(a)
|
||||
#endif
|
||||
|
||||
template<std::size_t I, class T, std::size_t N> constexpr inline T& array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; }
|
||||
template<std::size_t I, class T, std::size_t N> constexpr inline T&& array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; }
|
||||
template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
|
||||
|
||||
#undef STD_GET_ARR_HACK
|
||||
|
||||
template <typename T> struct array_size;
|
||||
template<class T, std::size_t N> struct array_size<const std::array<T,N> > {
|
||||
static const size_t value = N;
|
||||
};
|
||||
template <typename T> struct array_size;
|
||||
template<class T, std::size_t N> struct array_size<std::array<T,N> > {
|
||||
static const size_t value = N;
|
||||
};
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif
|
||||
|
||||
#endif // EIGEN_EMULATE_ARRAY_H
|
|
@ -0,0 +1,311 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at the mozilla.org home page
|
||||
|
||||
#ifndef EIGEN_EMULATE_CXX11_META_H
|
||||
#define EIGEN_EMULATE_CXX11_META_H
|
||||
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
/** \internal
|
||||
* \file CXX11/util/EmulateCXX11Meta.h
|
||||
* This file emulates a subset of the functionality provided by CXXMeta.h for
|
||||
* compilers that don't yet support cxx11 such as nvcc.
|
||||
*/
|
||||
|
||||
struct empty_list { static const std::size_t count = 0; };
|
||||
|
||||
template<typename T, typename Tail=empty_list> struct type_list {
|
||||
typedef T HeadType;
|
||||
typedef Tail TailType;
|
||||
static const T head;
|
||||
static const Tail tail;
|
||||
static const std::size_t count = 1 + Tail::count;
|
||||
};
|
||||
|
||||
struct null_type { };
|
||||
|
||||
template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type,
|
||||
typename T4 = null_type, typename T5 = null_type, typename T6 = null_type,
|
||||
typename T7 = null_type, typename T8 = null_type>
|
||||
struct make_type_list {
|
||||
typedef typename make_type_list<T2, T3, T4, T5, T6, T7, T8>::type tailresult;
|
||||
|
||||
typedef type_list<T1, tailresult> type;
|
||||
};
|
||||
|
||||
template<> struct make_type_list<> {
|
||||
typedef empty_list type;
|
||||
};
|
||||
|
||||
|
||||
template <std::size_t index, class TList> struct get_type;
|
||||
|
||||
template <class Head, class Tail>
|
||||
struct get_type<0, type_list<Head, Tail> >
|
||||
{
|
||||
typedef Head type;
|
||||
};
|
||||
|
||||
template <std::size_t i, class Head, class Tail>
|
||||
struct get_type<i, type_list<Head, Tail> >
|
||||
{
|
||||
typedef typename get_type<i-1, Tail>::type type;
|
||||
};
|
||||
|
||||
|
||||
/* numeric list */
|
||||
template <typename T, T n>
|
||||
struct type2val {
|
||||
typedef T type;
|
||||
static const T value = n;
|
||||
};
|
||||
|
||||
|
||||
template<typename T, size_t n, T V> struct gen_numeric_list_repeated;
|
||||
|
||||
template<typename T, T V> struct gen_numeric_list_repeated<T, 1, V> {
|
||||
typedef typename make_type_list<type2val<T, V> >::type type;
|
||||
};
|
||||
|
||||
template<typename T, T V> struct gen_numeric_list_repeated<T, 2, V> {
|
||||
typedef typename make_type_list<type2val<T, V>, type2val<T, V> >::type type;
|
||||
};
|
||||
|
||||
template<typename T, T V> struct gen_numeric_list_repeated<T, 3, V> {
|
||||
typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
|
||||
};
|
||||
|
||||
template<typename T, T V> struct gen_numeric_list_repeated<T, 4, V> {
|
||||
typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
|
||||
};
|
||||
|
||||
template<typename T, T V> struct gen_numeric_list_repeated<T, 5, V> {
|
||||
typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
|
||||
};
|
||||
|
||||
template<typename T, T V> struct gen_numeric_list_repeated<T, 6, V> {
|
||||
typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
|
||||
type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
|
||||
};
|
||||
|
||||
template<typename T, T V> struct gen_numeric_list_repeated<T, 7, V> {
|
||||
typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
|
||||
type2val<T, V>, type2val<T, V>, type2val<T, V>,
|
||||
type2val<T, V> >::type type;
|
||||
};
|
||||
|
||||
template<typename T, T V> struct gen_numeric_list_repeated<T, 8, V> {
|
||||
typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
|
||||
type2val<T, V>, type2val<T, V>, type2val<T, V>,
|
||||
type2val<T, V>, type2val<T, V> >::type type;
|
||||
};
|
||||
|
||||
|
||||
template <std::size_t index, class NList> struct get;
|
||||
|
||||
template <std::size_t i>
|
||||
struct get<i, empty_list>
|
||||
{
|
||||
get() { eigen_assert(false && "index overflow"); }
|
||||
typedef void type;
|
||||
static const char value = '\0';
|
||||
};
|
||||
|
||||
template <std::size_t i, class Head>
|
||||
struct get<i, type_list<Head, empty_list> >
|
||||
{
|
||||
get() { eigen_assert(false && "index overflow"); }
|
||||
typedef void type;
|
||||
static const char value = '\0';
|
||||
};
|
||||
|
||||
template <class Head>
|
||||
struct get<0, type_list<Head, empty_list> >
|
||||
{
|
||||
typedef typename Head::type type;
|
||||
static const type value = Head::value;
|
||||
};
|
||||
|
||||
template <class Head, class Tail>
|
||||
struct get<0, type_list<Head, Tail> >
|
||||
{
|
||||
typedef typename Head::type type;
|
||||
static const type value = Head::value;
|
||||
};
|
||||
|
||||
template <std::size_t i, class Head, class Tail>
|
||||
struct get<i, type_list<Head, Tail> >
|
||||
{
|
||||
typedef typename Tail::HeadType::type type;
|
||||
static const type value = get<i-1, Tail>::value;
|
||||
};
|
||||
|
||||
|
||||
template <class NList> struct arg_prod {
|
||||
static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod<typename NList::TailType>::value;
|
||||
};
|
||||
template <> struct arg_prod<empty_list> {
|
||||
static const int value = 1;
|
||||
};
|
||||
|
||||
|
||||
template<int n, typename t>
|
||||
array<t, n> repeat(t v) {
|
||||
array<t, n> array;
|
||||
array.fill(v);
|
||||
return array;
|
||||
}
|
||||
|
||||
template<std::size_t I, class Head, class Tail>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>&) {
|
||||
return get<I, type_list<Head, Tail> >::value;
|
||||
}
|
||||
template<std::size_t I, class Head, class Tail>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>&) {
|
||||
return get<I, type_list<Head, Tail> >::value;
|
||||
}
|
||||
|
||||
template <class NList>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList&) {
|
||||
return arg_prod<NList>::value;
|
||||
}
|
||||
|
||||
template<typename t, std::size_t n>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) {
|
||||
t prod = 1;
|
||||
for (size_t i = 0; i < n; ++i) { prod *= a[i]; }
|
||||
return prod;
|
||||
}
|
||||
template<typename t>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, 0>& /*a*/) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
template<typename t>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
|
||||
eigen_assert(a.size() > 0);
|
||||
t prod = 1;
|
||||
for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
|
||||
return prod;
|
||||
}
|
||||
|
||||
|
||||
template<std::size_t I, class T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector<T>& a) {
|
||||
return a[I];
|
||||
}
|
||||
template<std::size_t I, class T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector<T>& a) {
|
||||
return a[I];
|
||||
}
|
||||
|
||||
struct sum_op {
|
||||
template<typename A, typename B> static inline bool run(A a, B b) { return a + b; }
|
||||
};
|
||||
struct product_op {
|
||||
template<typename A, typename B> static inline bool run(A a, B b) { return a * b; }
|
||||
};
|
||||
|
||||
struct logical_and_op {
|
||||
template<typename A, typename B> static inline bool run(A a, B b) { return a && b; }
|
||||
};
|
||||
struct logical_or_op {
|
||||
template<typename A, typename B> static inline bool run(A a, B b) { return a || b; }
|
||||
};
|
||||
|
||||
struct equal_op {
|
||||
template<typename A, typename B> static inline bool run(A a, B b) { return a == b; }
|
||||
};
|
||||
struct not_equal_op {
|
||||
template<typename A, typename B> static inline bool run(A a, B b) { return a != b; }
|
||||
};
|
||||
struct lesser_op {
|
||||
template<typename A, typename B> static inline bool run(A a, B b) { return a < b; }
|
||||
};
|
||||
struct lesser_equal_op {
|
||||
template<typename A, typename B> static inline bool run(A a, B b) { return a <= b; }
|
||||
};
|
||||
|
||||
struct greater_op {
|
||||
template<typename A, typename B> static inline bool run(A a, B b) { return a > b; }
|
||||
};
|
||||
struct greater_equal_op {
|
||||
template<typename A, typename B> static inline bool run(A a, B b) { return a >= b; }
|
||||
};
|
||||
|
||||
struct not_op {
|
||||
template<typename A> static inline bool run(A a) { return !a; }
|
||||
};
|
||||
struct negation_op {
|
||||
template<typename A> static inline bool run(A a) { return -a; }
|
||||
};
|
||||
struct greater_equal_zero_op {
|
||||
template<typename A> static inline bool run(A a) { return a >= 0; }
|
||||
};
|
||||
|
||||
|
||||
template<typename Reducer, typename Op, typename A, std::size_t N>
|
||||
struct ArrayApplyAndReduce {
|
||||
static inline bool run(const array<A, N>& a) {
|
||||
EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
bool result = Reducer::run(Op::run(a[0]), Op::run(a[1]));
|
||||
for (size_t i = 2; i < N; ++i) {
|
||||
result = Reducer::run(result, Op::run(a[i]));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Reducer, typename Op, typename A>
|
||||
struct ArrayApplyAndReduce<Reducer, Op, A, 1> {
|
||||
static inline bool run(const array<A, 1>& a) {
|
||||
return Op::run(a[0]);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Reducer, typename Op, typename A, std::size_t N>
|
||||
inline bool array_apply_and_reduce(const array<A, N>& a) {
|
||||
return ArrayApplyAndReduce<Reducer, Op, A, N>::run(a);
|
||||
}
|
||||
|
||||
template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
|
||||
struct ArrayZipAndReduce {
|
||||
static inline bool run(const array<A, N>& a, const array<B, N>& b) {
|
||||
EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1]));
|
||||
for (size_t i = 2; i < N; ++i) {
|
||||
result = Reducer::run(result, Op::run(a[i], b[i]));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Reducer, typename Op, typename A, typename B>
|
||||
struct ArrayZipAndReduce<Reducer, Op, A, B, 1> {
|
||||
static inline bool run(const array<A, 1>& a, const array<B, 1>& b) {
|
||||
return Op::run(a[0], b[0]);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
|
||||
inline bool array_zip_and_reduce(const array<A, N>& a, const array<B, N>& b) {
|
||||
return ArrayZipAndReduce<Reducer, Op, A, B, N>::run(a, b);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
|
||||
|
||||
#endif // EIGEN_EMULATE_CXX11_META_H
|
|
@ -0,0 +1,141 @@
|
|||
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at the mozilla.org home page

#ifndef EIGEN_FIXEDSIZEVECTOR_H
#define EIGEN_FIXEDSIZEVECTOR_H

namespace Eigen {

/** \class MaxSizeVector
  * \ingroup Core
  *
  * \brief The MaxSizeVector class.
  *
  * The %MaxSizeVector provides a subset of std::vector functionality.
  *
  * The goal is to provide basic std::vector operations when using
  * std::vector is not an option (e.g. on GPU or when compiling using
  * FMA/AVX, as this can cause either compilation failures or illegal
  * instruction failures).
  *
  * Beware: The constructors are not API compatible with those of
  * std::vector.
  */
template <typename T>
class MaxSizeVector {
 public:
  // Construct a new MaxSizeVector, reserve n elements.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  explicit MaxSizeVector(size_t n)
      : reserve_(n), size_(0),
        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T; }
  }

  // Construct a new MaxSizeVector, reserve and resize to n.
  // Copy the init value to all elements.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  MaxSizeVector(size_t n, const T& init)
      : reserve_(n), size_(n),
        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  ~MaxSizeVector() {
    for (size_t i = 0; i < size_; ++i) {
      data_[i].~T();
    }
    internal::aligned_free(data_);
  }

  void resize(size_t n) {
    eigen_assert(n <= reserve_);
    for (size_t i = size_; i < n; ++i) {
      new (&data_[i]) T;
    }
    for (size_t i = n; i < size_; ++i) {
      data_[i].~T();
    }
    size_ = n;
  }

  // Append new elements (up to reserved size).
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void push_back(const T& t) {
    eigen_assert(size_ < reserve_);
    data_[size_++] = t;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const T& operator[] (size_t i) const {
    eigen_assert(i < size_);
    return data_[i];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  T& operator[] (size_t i) {
    eigen_assert(i < size_);
    return data_[i];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  T& back() {
    eigen_assert(size_ > 0);
    return data_[size_ - 1];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const T& back() const {
    eigen_assert(size_ > 0);
    return data_[size_ - 1];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void pop_back() {
    // NOTE: This does not destroy the value at the end the way
    // std::vector's version of pop_back() does. That happens when
    // the Vector is destroyed.
    eigen_assert(size_ > 0);
    size_--;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  size_t size() const { return size_; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  bool empty() const { return size_ == 0; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  T* data() { return data_; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const T* data() const { return data_; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  T* begin() { return data_; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  T* end() { return data_ + size_; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const T* begin() const { return data_; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const T* end() const { return data_ + size_; }

 private:
  size_t reserve_;
  size_t size_;
  T* data_;
};

} // namespace Eigen

#endif // EIGEN_FIXEDSIZEVECTOR_H

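A minimal usage sketch of the MaxSizeVector class added above (not part of the Eigen sources; the include of the unsupported CXX11 Tensor header is an assumption about how this file is reached in the tree):

#include <unsupported/Eigen/CXX11/Tensor>  // assumed entry point that pulls in MaxSizeVector
#include <cassert>

void max_size_vector_demo() {
  Eigen::MaxSizeVector<int> v(4);   // reserves room for 4 ints; size() starts at 0
  v.push_back(1);
  v.push_back(2);                   // push_back asserts against the reserved capacity
  assert(v.size() == 2);
  v[0] = 7;                         // operator[] is bounds-asserted against size()
  v.resize(1);                      // shrink in place; capacity stays at 4
  assert(v.back() == 7);

  Eigen::MaxSizeVector<double> w(3, 1.5);  // fill constructor: size() == 3, all elements 1.5
  assert(!w.empty() && w.back() == 1.5);
}
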
@@ -0,0 +1,43 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at the mozilla.org home page

#ifndef EIGEN_EULERANGLES_MODULE_H
#define EIGEN_EULERANGLES_MODULE_H


#include "Eigen/Core"
#include "Eigen/Geometry"

#include "Eigen/src/Core/util/DisableStupidWarnings.h"

namespace Eigen {

/**
  * \defgroup EulerAngles_Module EulerAngles module
  * \brief This module provides generic Euler angles rotation.
  *
  * Euler angles are a way to represent 3D rotation.
  *
  * In order to use this module in your code, include this header:
  * \code
  * #include <unsupported/Eigen/EulerAngles>
  * \endcode
  *
  * See \ref EulerAngles for more information.
  *
  */

}

#include "src/EulerAngles/EulerSystem.h"
#include "src/EulerAngles/EulerAngles.h"

#include "Eigen/src/Core/util/ReenableStupidWarnings.h"

#endif // EIGEN_EULERANGLES_MODULE_H

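A short, hedged example of the intended use of this module; the EulerAnglesXYZd typedef and the conversions below come from src/EulerAngles/EulerAngles.h, which is not shown in this hunk, so treat the exact names as assumptions:

#include <unsupported/Eigen/EulerAngles>
#include <iostream>

int main() {
  // EulerAnglesXYZd is assumed to be one of the convenience typedefs provided by the module.
  Eigen::EulerAnglesXYZd ea(0.1, 0.2, 0.3);      // three rotation angles, in radians
  Eigen::Quaterniond q(ea);                      // EulerAngles derives from RotationBase, so this converts
  Eigen::Matrix3d R = ea.toRotationMatrix();     // the same rotation as a 3x3 matrix
  std::cout << (R * Eigen::Vector3d::UnitX()).transpose() << "\n";
  std::cout << q.coeffs().transpose() << "\n";
  return 0;
}
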
@@ -0,0 +1,419 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009 Mark Borgerding mark a borgerding net
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at the mozilla.org home page

#ifndef EIGEN_FFT_H
#define EIGEN_FFT_H

#include <complex>
#include <vector>
#include <map>
#include <Eigen/Core>


/**
  * \defgroup FFT_Module Fast Fourier Transform module
  *
  * \code
  * #include <unsupported/Eigen/FFT>
  * \endcode
  *
  * This module provides Fast Fourier transformation, with a configurable backend
  * implementation.
  *
  * The default implementation is based on kissfft. It is a small, free, and
  * reasonably efficient default.
  *
  * There are currently two implementation backends:
  *
  * - fftw (xxxp://www.fftw.org) : faster, GPL -- incompatible with Eigen in LGPL form, bigger code size.
  * - MKL (xxxp://en.wikipedia.org/wiki/Math_Kernel_Library) : fastest, commercial -- may be incompatible with Eigen in GPL form.
  *
  * \section FFTDesign Design
  *
  * The following design decisions were made concerning scaling and
  * half-spectrum for real FFT.
  *
  * The intent is to facilitate generic programming and ease migrating code
  * from Matlab/octave.
  * We think the default behavior of Eigen/FFT should favor correctness and
  * generality over speed. Of course, the caller should be able to "opt-out" from this
  * behavior and get the speed increase if they want it.
  *
  * 1) %Scaling:
  * Other libraries (FFTW, IMKL, KISSFFT) do not perform scaling, so there
  * is a constant gain incurred after the forward & inverse transforms, so
  * IFFT(FFT(x)) = Kx; this is done to avoid a vector-by-value multiply.
  * The downside is that algorithms that worked correctly in Matlab/octave
  * don't behave the same way once implemented in C++.
  *
  * How Eigen/FFT differs: invertible scaling is performed so IFFT( FFT(x) ) = x.
  *
  * 2) Real FFT half-spectrum
  * Other libraries use only half the frequency spectrum (plus one extra
  * sample for the Nyquist bin) for a real FFT, the other half is the
  * conjugate-symmetric of the first half. This saves them a copy and some
  * memory. The downside is the caller needs to have special logic for the
  * number of bins in complex vs real.
  *
  * How Eigen/FFT differs: The full spectrum is returned from the forward
  * transform. This facilitates generic template programming by obviating
  * separate specializations for real vs complex. On the inverse
  * transform, only half the spectrum is actually used if the output type is real.
  */
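To make the scaling and half-spectrum points above concrete, here is a small sketch (not part of the header) of the round-trip behaviour using the std::vector overloads declared later in this file; with default flags the forward transform of a real signal returns the full spectrum and inv(fwd(x)) reproduces x:

#include <unsupported/Eigen/FFT>
#include <cassert>
#include <cmath>
#include <complex>
#include <cstddef>
#include <vector>

int main() {
  Eigen::FFT<float> fft;                         // kissfft backend unless FFTW/MKL is selected below

  std::vector<float> x(8);
  for (std::size_t i = 0; i < x.size(); ++i)
    x[i] = std::cos(2.0f * 3.14159265f * float(i) / float(x.size()));

  std::vector<std::complex<float> > X;
  fft.fwd(X, x);                                 // full spectrum: X.size() == x.size()
  assert(X.size() == x.size());

  std::vector<float> y;
  fft.inv(y, X);                                 // default (scaled) inverse, so y ~= x

  for (std::size_t i = 0; i < x.size(); ++i)
    assert(std::abs(y[i] - x[i]) < 1e-4f);
  return 0;
}
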

#ifdef EIGEN_FFTW_DEFAULT
// FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
# include <fftw3.h>
# include "src/FFT/ei_fftw_impl.h"
  namespace Eigen {
    //template <typename T> typedef struct internal::fftw_impl default_fft_impl; this does not work
    template <typename T> struct default_fft_impl : public internal::fftw_impl<T> {};
  }
#elif defined EIGEN_MKL_DEFAULT
// TODO
// intel Math Kernel Library: fastest, commercial -- may be incompatible with Eigen in GPL form
# include "src/FFT/ei_imklfft_impl.h"
  namespace Eigen {
    template <typename T> struct default_fft_impl : public internal::imklfft_impl {};
  }
#else
// internal::kissfft_impl: small, free, reasonably efficient default, derived from kissfft
//
# include "src/FFT/ei_kissfft_impl.h"
  namespace Eigen {
    template <typename T>
    struct default_fft_impl : public internal::kissfft_impl<T> {};
  }
#endif

namespace Eigen {


//
template<typename T_SrcMat,typename T_FftIfc> struct fft_fwd_proxy;
template<typename T_SrcMat,typename T_FftIfc> struct fft_inv_proxy;

namespace internal {
template<typename T_SrcMat,typename T_FftIfc>
struct traits< fft_fwd_proxy<T_SrcMat,T_FftIfc> >
{
  typedef typename T_SrcMat::PlainObject ReturnType;
};
template<typename T_SrcMat,typename T_FftIfc>
struct traits< fft_inv_proxy<T_SrcMat,T_FftIfc> >
{
  typedef typename T_SrcMat::PlainObject ReturnType;
};
}

template<typename T_SrcMat,typename T_FftIfc>
struct fft_fwd_proxy
 : public ReturnByValue<fft_fwd_proxy<T_SrcMat,T_FftIfc> >
{
  typedef DenseIndex Index;

  fft_fwd_proxy(const T_SrcMat& src,T_FftIfc & fft, Index nfft) : m_src(src),m_ifc(fft), m_nfft(nfft) {}

  template<typename T_DestMat> void evalTo(T_DestMat& dst) const;

  Index rows() const { return m_src.rows(); }
  Index cols() const { return m_src.cols(); }
protected:
  const T_SrcMat & m_src;
  T_FftIfc & m_ifc;
  Index m_nfft;
private:
  fft_fwd_proxy& operator=(const fft_fwd_proxy&);
};

template<typename T_SrcMat,typename T_FftIfc>
struct fft_inv_proxy
 : public ReturnByValue<fft_inv_proxy<T_SrcMat,T_FftIfc> >
{
  typedef DenseIndex Index;

  fft_inv_proxy(const T_SrcMat& src,T_FftIfc & fft, Index nfft) : m_src(src),m_ifc(fft), m_nfft(nfft) {}

  template<typename T_DestMat> void evalTo(T_DestMat& dst) const;

  Index rows() const { return m_src.rows(); }
  Index cols() const { return m_src.cols(); }
protected:
  const T_SrcMat & m_src;
  T_FftIfc & m_ifc;
  Index m_nfft;
private:
  fft_inv_proxy& operator=(const fft_inv_proxy&);
};


template <typename T_Scalar,
          typename T_Impl=default_fft_impl<T_Scalar> >
class FFT
{
  public:
    typedef T_Impl impl_type;
    typedef DenseIndex Index;
    typedef typename impl_type::Scalar Scalar;
    typedef typename impl_type::Complex Complex;

    enum Flag {
      Default=0, // goof proof
      Unscaled=1,
      HalfSpectrum=2,
      // SomeOtherSpeedOptimization=4
      Speedy=32767
    };

    FFT( const impl_type & impl=impl_type() , Flag flags=Default ) :m_impl(impl),m_flag(flags) { }

    inline
    bool HasFlag(Flag f) const { return (m_flag & (int)f) == f;}

    inline
    void SetFlag(Flag f) { m_flag |= (int)f;}

    inline
    void ClearFlag(Flag f) { m_flag &= (~(int)f);}

    inline
    void fwd( Complex * dst, const Scalar * src, Index nfft)
    {
      m_impl.fwd(dst,src,static_cast<int>(nfft));
      if ( HasFlag(HalfSpectrum) == false)
        ReflectSpectrum(dst,nfft);
    }

    inline
    void fwd( Complex * dst, const Complex * src, Index nfft)
    {
      m_impl.fwd(dst,src,static_cast<int>(nfft));
    }

    /*
    inline
    void fwd2(Complex * dst, const Complex * src, int n0,int n1)
    {
      m_impl.fwd2(dst,src,n0,n1);
    }
    */

    template <typename _Input>
    inline
    void fwd( std::vector<Complex> & dst, const std::vector<_Input> & src)
    {
      if ( NumTraits<_Input>::IsComplex == 0 && HasFlag(HalfSpectrum) )
        dst.resize( (src.size()>>1)+1); // half the bins + Nyquist bin
      else
        dst.resize(src.size());
      fwd(&dst[0],&src[0],src.size());
    }

    template<typename InputDerived, typename ComplexDerived>
    inline
    void fwd( MatrixBase<ComplexDerived> & dst, const MatrixBase<InputDerived> & src, Index nfft=-1)
    {
      typedef typename ComplexDerived::Scalar dst_type;
      typedef typename InputDerived::Scalar src_type;
      EIGEN_STATIC_ASSERT_VECTOR_ONLY(InputDerived)
      EIGEN_STATIC_ASSERT_VECTOR_ONLY(ComplexDerived)
      EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(ComplexDerived,InputDerived) // size at compile-time
      EIGEN_STATIC_ASSERT((internal::is_same<dst_type, Complex>::value),
            YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
      EIGEN_STATIC_ASSERT(int(InputDerived::Flags)&int(ComplexDerived::Flags)&DirectAccessBit,
            THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES)

      if (nfft<1)
        nfft = src.size();

      if ( NumTraits< src_type >::IsComplex == 0 && HasFlag(HalfSpectrum) )
        dst.derived().resize( (nfft>>1)+1);
      else
        dst.derived().resize(nfft);

      if ( src.innerStride() != 1 || src.size() < nfft ) {
        Matrix<src_type,1,Dynamic> tmp;
        if (src.size()<nfft) {
          tmp.setZero(nfft);
          tmp.block(0,0,src.size(),1 ) = src;
        }else{
          tmp = src;
        }
        fwd( &dst[0],&tmp[0],nfft );
      }else{
        fwd( &dst[0],&src[0],nfft );
      }
    }

    template<typename InputDerived>
    inline
    fft_fwd_proxy< MatrixBase<InputDerived>, FFT<T_Scalar,T_Impl> >
    fwd( const MatrixBase<InputDerived> & src, Index nfft=-1)
    {
      return fft_fwd_proxy< MatrixBase<InputDerived> ,FFT<T_Scalar,T_Impl> >( src, *this,nfft );
    }

    template<typename InputDerived>
    inline
    fft_inv_proxy< MatrixBase<InputDerived>, FFT<T_Scalar,T_Impl> >
    inv( const MatrixBase<InputDerived> & src, Index nfft=-1)
    {
      return fft_inv_proxy< MatrixBase<InputDerived> ,FFT<T_Scalar,T_Impl> >( src, *this,nfft );
    }

    inline
    void inv( Complex * dst, const Complex * src, Index nfft)
    {
      m_impl.inv( dst,src,static_cast<int>(nfft) );
      if ( HasFlag( Unscaled ) == false)
        scale(dst,Scalar(1./nfft),nfft); // scale the time series
    }

    inline
    void inv( Scalar * dst, const Complex * src, Index nfft)
    {
      m_impl.inv( dst,src,static_cast<int>(nfft) );
      if ( HasFlag( Unscaled ) == false)
        scale(dst,Scalar(1./nfft),nfft); // scale the time series
    }

    template<typename OutputDerived, typename ComplexDerived>
    inline
    void inv( MatrixBase<OutputDerived> & dst, const MatrixBase<ComplexDerived> & src, Index nfft=-1)
    {
      typedef typename ComplexDerived::Scalar src_type;
      typedef typename ComplexDerived::RealScalar real_type;
      typedef typename OutputDerived::Scalar dst_type;
      const bool realfft= (NumTraits<dst_type>::IsComplex == 0);
      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OutputDerived)
      EIGEN_STATIC_ASSERT_VECTOR_ONLY(ComplexDerived)
      EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(ComplexDerived,OutputDerived) // size at compile-time
      EIGEN_STATIC_ASSERT((internal::is_same<src_type, Complex>::value),
            YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
      EIGEN_STATIC_ASSERT(int(OutputDerived::Flags)&int(ComplexDerived::Flags)&DirectAccessBit,
            THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES)

      if (nfft<1) { //automatic FFT size determination
        if ( realfft && HasFlag(HalfSpectrum) )
          nfft = 2*(src.size()-1); //assume even fft size
        else
          nfft = src.size();
      }
      dst.derived().resize( nfft );

      // check for nfft that does not fit the input data size
      Index resize_input= ( realfft && HasFlag(HalfSpectrum) )
        ? ( (nfft/2+1) - src.size() )
        : ( nfft - src.size() );

      if ( src.innerStride() != 1 || resize_input ) {
        // if the vector is strided, then we need to copy it to a packed temporary
        Matrix<src_type,1,Dynamic> tmp;
        if ( resize_input ) {
          size_t ncopy = (std::min)(src.size(),src.size() + resize_input);
          tmp.setZero(src.size() + resize_input);
          if ( realfft && HasFlag(HalfSpectrum) ) {
            // pad at the Nyquist bin
            tmp.head(ncopy) = src.head(ncopy);
            tmp(ncopy-1) = real(tmp(ncopy-1)); // enforce real-only Nyquist bin
          }else{
            size_t nhead,ntail;
            nhead = 1+ncopy/2-1; // range [0:pi)
            ntail = ncopy/2-1;   // range (-pi:0)
            tmp.head(nhead) = src.head(nhead);
            tmp.tail(ntail) = src.tail(ntail);
            if (resize_input<0) { //shrinking -- create the Nyquist bin as the average of the two bins that fold into it
              tmp(nhead) = ( src(nfft/2) + src( src.size() - nfft/2 ) )*real_type(.5);
            }else{ // expanding -- split the old Nyquist bin into two halves
              tmp(nhead) = src(nhead) * real_type(.5);
              tmp(tmp.size()-nhead) = tmp(nhead);
            }
          }
        }else{
          tmp = src;
        }
        inv( &dst[0],&tmp[0], nfft);
      }else{
        inv( &dst[0],&src[0], nfft);
      }
    }

    template <typename _Output>
    inline
    void inv( std::vector<_Output> & dst, const std::vector<Complex> & src,Index nfft=-1)
    {
      if (nfft<1)
        nfft = ( NumTraits<_Output>::IsComplex == 0 && HasFlag(HalfSpectrum) ) ? 2*(src.size()-1) : src.size();
      dst.resize( nfft );
      inv( &dst[0],&src[0],nfft);
    }


    /*
    // TODO: multi-dimensional FFTs
    inline
    void inv2(Complex * dst, const Complex * src, int n0,int n1)
    {
      m_impl.inv2(dst,src,n0,n1);
      if ( HasFlag( Unscaled ) == false)
          scale(dst,1./(n0*n1),n0*n1);
    }
    */

    inline
    impl_type & impl() {return m_impl;}
  private:

    template <typename T_Data>
    inline
    void scale(T_Data * x,Scalar s,Index nx)
    {
#if 1
      for (int k=0;k<nx;++k)
        *x++ *= s;
#else
      if ( ((ptrdiff_t)x) & 15 )
        Matrix<T_Data, Dynamic, 1>::Map(x,nx) *= s;
      else
        Matrix<T_Data, Dynamic, 1>::MapAligned(x,nx) *= s;
      //Matrix<T_Data, Dynamic, Dynamic>::Map(x,nx) * s;
#endif
    }

    inline
    void ReflectSpectrum(Complex * freq, Index nfft)
    {
      // create the implicit right-half spectrum (conjugate-mirror of the left-half)
      Index nhbins=(nfft>>1)+1;
      for (Index k=nhbins;k < nfft; ++k )
        freq[k] = conj(freq[nfft-k]);
    }

    impl_type m_impl;
    int m_flag;
};

template<typename T_SrcMat,typename T_FftIfc>
template<typename T_DestMat> inline
void fft_fwd_proxy<T_SrcMat,T_FftIfc>::evalTo(T_DestMat& dst) const
{
    m_ifc.fwd( dst, m_src, m_nfft);
}

template<typename T_SrcMat,typename T_FftIfc>
template<typename T_DestMat> inline
void fft_inv_proxy<T_SrcMat,T_FftIfc>::evalTo(T_DestMat& dst) const
{
    m_ifc.inv( dst, m_src, m_nfft);
}

}
#endif
/* vim: set filetype=cpp et sw=2 ts=2 ai: */
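A second hedged sketch, this time for the MatrixBase overloads and the flags defined in the FFT class above; HalfSpectrum keeps only the N/2+1 non-redundant bins of a real forward transform, and the inverse deduces the original length from them:

#include <unsupported/Eigen/FFT>
#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::FFT<double> fft;
  fft.SetFlag(Eigen::FFT<double>::HalfSpectrum);   // opt in to the compact real spectrum

  Eigen::VectorXd  x = Eigen::VectorXd::LinSpaced(16, 0.0, 1.0);
  Eigen::VectorXcd X;
  fft.fwd(X, x);                                   // X.size() == 16/2 + 1 == 9

  Eigen::VectorXd y;
  fft.inv(y, X);                                   // nfft deduced as 2*(X.size()-1) == 16

  std::cout << "round-trip error: " << (y - x).norm() << "\n";  // ~0 with the default scaled inverse
  return 0;
}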
Some files were not shown because too many files have changed in this diff.