From 5556f4184c3676c1587b3578391c319aec885894 Mon Sep 17 00:00:00 2001 From: korialis Date: Tue, 16 Nov 2021 15:29:10 +0100 Subject: [PATCH 01/38] fixed some const correctness issues on tetrahedrons --- vcg/complex/algorithms/stat.h | 2 +- vcg/complex/allocate.h | 2 +- vcg/simplex/tetrahedron/component.h | 8 +++++--- vcg/simplex/vertex/component.h | 1 + 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/vcg/complex/algorithms/stat.h b/vcg/complex/algorithms/stat.h index c9b0e7cb..14a6d8fd 100644 --- a/vcg/complex/algorithms/stat.h +++ b/vcg/complex/algorithms/stat.h @@ -250,7 +250,7 @@ public: return V; } - static ScalarType ComputeMeshVolume(const MeshType & m) + static ScalarType ComputeMeshVolume(MeshType & m) { Inertia I(m); return I.Mass(); diff --git a/vcg/complex/allocate.h b/vcg/complex/allocate.h index ee96b2c9..72b9255e 100644 --- a/vcg/complex/allocate.h +++ b/vcg/complex/allocate.h @@ -2049,7 +2049,7 @@ public: } template - static typename MeshType::template ConstPerTetraAttributeHandle FindPerTetraAttribute(MeshType &m, const std::string &name) + static typename MeshType::template ConstPerTetraAttributeHandle FindPerTetraAttribute(const MeshType &m, const std::string &name) { if(!name.empty()){ PointerToAttribute h1; diff --git a/vcg/simplex/tetrahedron/component.h b/vcg/simplex/tetrahedron/component.h index cb1a6e87..9ba9d0ed 100644 --- a/vcg/simplex/tetrahedron/component.h +++ b/vcg/simplex/tetrahedron/component.h @@ -48,7 +48,7 @@ template class EmptyCore : public T { public: //Empty vertexref inline typename T::VertexType * & V( const int ) { assert(0); static typename T::VertexType *vp=0; return vp; } - inline typename T::VertexType * const & V( const int ) const { assert(0); static typename T::VertexType *vp=0; return vp; } + inline const typename T::VertexType * V( const int ) const { assert(0); static typename T::VertexType *vp=0; return vp; } inline const typename T::VertexType * cV( const int ) const { assert(0); static typename 
T::VertexType *vp=0; return vp; } inline typename T::CoordType & P( const int ) { assert(0); static typename T::CoordType coord(0, 0, 0); return coord; } inline const typename T::CoordType & P( const int ) const { assert(0); static typename T::CoordType coord(0, 0, 0); return coord; } @@ -181,7 +181,8 @@ public: typedef typename T::VertexType::ScalarType ScalarType; inline typename T::VertexType * & V( const int j ) { assert(j>=0 && j<4); return v[j]; } - inline const typename T::VertexType * cV( const int j ) { assert(j>=0 && j<4); return v[j]; } + inline const typename T::VertexType * V( const int j ) const { assert(j>=0 && j<4); return v[j]; } + inline const typename T::VertexType * cV( const int j ) const { assert(j>=0 && j<4); return v[j]; } inline size_t cFtoVi (const int f, const int j) const { assert(f >= 0 && f < 4); assert(j >= 0 && j < 3); return findices[f][j]; } @@ -457,7 +458,8 @@ public: typename T::TetraPointer const cVTp( const int j ) const { assert( j >= 0 && j < 4 ); return _vtp[j]; } char & VTi( const int j ) { return _vti[j]; } - const char & cVTi( const int j ) const { return _vti[j]; } + char VTi( const int j ) const { return _vti[j]; } + char cVTi( const int j ) const { return _vti[j]; } static bool HasVTAdjacency() { return true; } static bool HasVTAdjacencyOcc() { return false; } diff --git a/vcg/simplex/vertex/component.h b/vcg/simplex/vertex/component.h index 37f27f93..251e3786 100644 --- a/vcg/simplex/vertex/component.h +++ b/vcg/simplex/vertex/component.h @@ -572,6 +572,7 @@ public: typename T::TetraPointer &VTp() { return _tp; } typename T::TetraPointer cVTp() const { return _tp; } int &VTi() {return _zp; } + int VTi() const { return _zp; } int cVTi() const { return _zp; } static bool HasVTAdjacency() { return true; } static void Name( std::vector< std::string > & name ) { name.push_back( std::string("VTAdj") ); T::Name(name); } From 920ca0a9322303af7e133973104958456a4634f7 Mon Sep 17 00:00:00 2001 From: korialis Date: Tue, 16 Nov 
2021 15:30:23 +0100 Subject: [PATCH 02/38] fixed tetrahedra import_ply propdescriptor --- wrap/io_tetramesh/import_ply.h | 46 +++++++++++++++++----------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/wrap/io_tetramesh/import_ply.h b/wrap/io_tetramesh/import_ply.h index e77b4741..ebd4589a 100644 --- a/wrap/io_tetramesh/import_ply.h +++ b/wrap/io_tetramesh/import_ply.h @@ -142,20 +142,20 @@ public: static const PropDescriptor &VertDesc(int i) { const static PropDescriptor pv[13]={ - /*00*/ {"vertex", "x", ply::T_FLOAT, PlyType(), offsetof(LoadPly_VertAux,p),0,0,0,0,0}, - /*01*/ {"vertex", "y", ply::T_FLOAT, PlyType(), offsetof(LoadPly_VertAux,p) + sizeof(ScalarType),0,0,0,0,0}, - /*02*/ {"vertex", "z", ply::T_FLOAT, PlyType(), offsetof(LoadPly_VertAux,p) + 2 * sizeof(ScalarType),0,0,0,0,0}, - /*03*/ {"vertex", "flags", ply::T_INT, ply::T_INT, offsetof(LoadPly_VertAux,flags),0,0,0,0,0}, - /*04*/ {"vertex", "quality", ply::T_FLOAT, ply::T_FLOAT, offsetof(LoadPly_VertAux,q),0,0,0,0,0}, - /*05*/ {"vertex", "red" , ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_VertAux,r),0,0,0,0,0}, - /*06*/ {"vertex", "green", ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_VertAux,g),0,0,0,0,0}, - /*07*/ {"vertex", "blue" , ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_VertAux,b),0,0,0,0,0}, - /*08*/ {"vertex", "alpha" , ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_VertAux,a),0,0,0,0,0}, + /*00*/ {"vertex", "x", ply::T_FLOAT, PlyType(), offsetof(LoadPly_VertAux,p), 0, 0, 0, 0, 0, 0}, + /*01*/ {"vertex", "y", ply::T_FLOAT, PlyType(), offsetof(LoadPly_VertAux,p) + sizeof(ScalarType), 0, 0, 0, 0, 0, 0}, + /*02*/ {"vertex", "z", ply::T_FLOAT, PlyType(), offsetof(LoadPly_VertAux,p) + 2*sizeof(ScalarType), 0, 0, 0, 0, 0, 0}, + /*03*/ {"vertex", "flags", ply::T_INT, ply::T_INT, offsetof(LoadPly_VertAux,flags), 0, 0, 0, 0, 0, 0}, + /*04*/ {"vertex", "quality", ply::T_FLOAT, PlyType(), offsetof(LoadPly_VertAux,q), 0, 0, 0, 0, 0, 0}, + /*05*/ {"vertex", "red" , ply::T_UCHAR, 
ply::T_UCHAR, offsetof(LoadPly_VertAux,r), 0, 0, 0, 0, 0, 0}, + /*06*/ {"vertex", "green", ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_VertAux,g), 0, 0, 0, 0, 0, 0}, + /*07*/ {"vertex", "blue" , ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_VertAux,b), 0, 0, 0, 0, 0, 0}, + /*08*/ {"vertex", "alpha" , ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_VertAux,a), 0, 0, 0, 0, 0, 0}, // DOUBLE - /*09*/ {"vertex", "x", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,p),0,0,0,0,0 ,0}, - /*10*/ {"vertex", "y", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,p) + sizeof(ScalarType) ,0,0,0,0,0 ,0}, - /*11*/ {"vertex", "z", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,p) + 2*sizeof(ScalarType),0,0,0,0,0 ,0}, - /*12*/ {"vertex", "quality", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,q),0,0,0,0,0 ,0} + /*09*/ {"vertex", "x", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,p), 0, 0, 0, 0, 0, 0}, + /*10*/ {"vertex", "y", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,p) + sizeof(ScalarType), 0, 0, 0, 0, 0, 0}, + /*11*/ {"vertex", "z", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,p) + 2*sizeof(ScalarType), 0, 0, 0, 0, 0, 0}, + /*12*/ {"vertex", "quality", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,q), 0, 0, 0, 0, 0, 0} }; return pv[i]; } @@ -165,16 +165,16 @@ public: { const static PropDescriptor qf[10]= { - {"tetra", "vertex_indices", ply::T_INT, ply::T_INT, offsetof(LoadPly_TetraAux,v), 1,0,ply::T_UCHAR,ply::T_UCHAR,offsetof(LoadPly_TetraAux,size) }, - {"tetra", "flags", ply::T_INT, ply::T_INT, offsetof(LoadPly_TetraAux,flags), 0,0,0,0,0}, - {"tetra", "quality", ply::T_FLOAT, ply::T_FLOAT, offsetof(LoadPly_TetraAux,q), 0,0,0,0,0}, - {"tetra", "texcoord", ply::T_FLOAT, ply::T_FLOAT, offsetof(LoadPly_TetraAux,texcoord), 1,0,ply::T_UCHAR,ply::T_UCHAR,offsetof(LoadPly_TetraAux,ntexcoord) }, - {"tetra", "color", ply::T_FLOAT, ply::T_FLOAT, offsetof(LoadPly_TetraAux,colors), 1,0,ply::T_UCHAR,ply::T_UCHAR,offsetof(LoadPly_TetraAux,ncolors) }, - {"tetra", 
"texnumber", ply::T_INT, ply::T_INT, offsetof(LoadPly_TetraAux,texcoordind), 0,0,0,0,0}, - {"tetra", "red" , ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_TetraAux,r), 0,0,0,0,0}, - {"tetra", "green", ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_TetraAux,g), 0,0,0,0,0}, - {"tetra", "blue" , ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_TetraAux,b), 0,0,0,0,0}, - {"tetra", "alpha" , ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_TetraAux,a), 0,0,0,0,0}, + {"tetra", "vertex_indices", ply::T_INT, ply::T_INT, offsetof(LoadPly_TetraAux,v), 1, 0, ply::T_UCHAR,ply::T_UCHAR,offsetof(LoadPly_TetraAux,size), 0}, + {"tetra", "flags", ply::T_INT, ply::T_INT, offsetof(LoadPly_TetraAux,flags), 0, 0, 0, 0, 0, 0}, + {"tetra", "quality", ply::T_FLOAT, ply::T_FLOAT, offsetof(LoadPly_TetraAux,q), 0, 0, 0, 0, 0, 0}, + {"tetra", "texcoord", ply::T_FLOAT, ply::T_FLOAT, offsetof(LoadPly_TetraAux,texcoord), 1, 0, ply::T_UCHAR,ply::T_UCHAR,offsetof(LoadPly_TetraAux,ntexcoord), 0}, + {"tetra", "color", ply::T_FLOAT, ply::T_FLOAT, offsetof(LoadPly_TetraAux,colors), 1, 0, ply::T_UCHAR,ply::T_UCHAR,offsetof(LoadPly_TetraAux,ncolors), 0}, + {"tetra", "texnumber", ply::T_INT, ply::T_INT, offsetof(LoadPly_TetraAux,texcoordind), 0, 0, 0, 0, 0, 0}, + {"tetra", "red" , ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_TetraAux,r), 0, 0, 0, 0, 0, 0}, + {"tetra", "green", ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_TetraAux,g), 0, 0, 0, 0, 0, 0}, + {"tetra", "blue" , ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_TetraAux,b), 0, 0, 0, 0, 0, 0}, + {"tetra", "alpha" , ply::T_UCHAR, ply::T_UCHAR, offsetof(LoadPly_TetraAux,a), 0, 0, 0, 0, 0, 0}, }; return qf[i]; From b36ec50e359378896c60b7ae5bb03f9055dcc870 Mon Sep 17 00:00:00 2001 From: korialis Date: Tue, 16 Nov 2021 15:36:42 +0100 Subject: [PATCH 03/38] removed unnecessary iheritance from deprecated std::binary_function --- vcg/complex/algorithms/geodesic.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git 
a/vcg/complex/algorithms/geodesic.h b/vcg/complex/algorithms/geodesic.h index db45cd2d..50f21896 100644 --- a/vcg/complex/algorithms/geodesic.h +++ b/vcg/complex/algorithms/geodesic.h @@ -213,12 +213,19 @@ public: typedef SimpleTempData, TempData > TempDataType; - struct pred: public std::binary_function{ - pred(){} - bool operator()(const VertDist& v0, const VertDist& v1) const - {return (v0.d > v1.d);} + class pred { + public: + pred () {}; + bool operator()(const VertDist& v0, const VertDist& v1) const + {return (v0.d > v1.d);} }; + //struct pred: public std::binary_function{ + // pred(){} + // bool operator()(const VertDist& v0, const VertDist& v1) const + // {return (v0.d > v1.d);} + //}; + /* * curr: vertex for which distance should be estimated From 9af7eb8158034dbb0b1ec816b82aa9f1585fde36 Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Wed, 17 Nov 2021 14:51:54 +0100 Subject: [PATCH 04/38] code cleaning --- vcg/complex/algorithms/geodesic.h | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/vcg/complex/algorithms/geodesic.h b/vcg/complex/algorithms/geodesic.h index db45cd2d..f0cb512d 100644 --- a/vcg/complex/algorithms/geodesic.h +++ b/vcg/complex/algorithms/geodesic.h @@ -138,12 +138,6 @@ public: - - - - - - /*! 
\brief class for computing approximate geodesic distances on a mesh require VF Adjacency relation @@ -582,26 +576,19 @@ It is just a simple wrapper of the basic Compute() template static void PerVertexDijkstraCompute(MeshType &m, const std::vector &seedVec, DistanceFunctor &distFunc, - ScalarType maxDistanceThr = std::numeric_limits::max(), - std::vector *InInterval=NULL, - typename MeshType::template PerVertexAttributeHandle * sourceHandle= NULL, - typename MeshType::template PerVertexAttributeHandle * parentHandle=NULL, - bool avoid_selected=false, - VertexPointer target=NULL) + ScalarType maxDistanceThr = std::numeric_limits::max(), + std::vector *InInterval=NULL, + typename MeshType::template PerVertexAttributeHandle * sourceHandle= NULL, + typename MeshType::template PerVertexAttributeHandle * parentHandle=NULL, + bool avoid_selected=false, + VertexPointer target=NULL) { tri::RequireVFAdjacency(m); tri::RequirePerVertexMark(m); tri::RequirePerVertexQuality(m); -// typename MeshType::template PerVertexAttributeHandle sourceHandle -// = tri::Allocator::template GetPerVertexAttribute(m, sourcesAttributeName()); - -// typename MeshType::template PerVertexAttributeHandle parentHandle -// = tri::Allocator::template GetPerVertexAttribute (m, parentsAttributeName()); - std::vector Heap; tri::UnMarkAll(m); - tri::UnMarkAll(m); for(size_t i=0;i Date: Wed, 17 Nov 2021 15:12:21 +0100 Subject: [PATCH 05/38] fixed const correctness for Inertia and some Stat functions + code cleaning --- vcg/complex/algorithms/inertia.h | 141 ++++++++++++++++--------------- vcg/complex/algorithms/stat.h | 6 +- 2 files changed, 75 insertions(+), 72 deletions(-) diff --git a/vcg/complex/algorithms/inertia.h b/vcg/complex/algorithms/inertia.h index 14d92b2a..c68c22fa 100644 --- a/vcg/complex/algorithms/inertia.h +++ b/vcg/complex/algorithms/inertia.h @@ -53,16 +53,16 @@ namespace vcg template class Inertia { - typedef typename MeshType::VertexType VertexType; - typedef typename 
MeshType::VertexPointer VertexPointer; - typedef typename MeshType::VertexIterator VertexIterator; - typedef typename MeshType::ScalarType ScalarType; - typedef typename MeshType::FaceType FaceType; - typedef typename MeshType::FacePointer FacePointer; - typedef typename MeshType::FaceIterator FaceIterator; - typedef typename MeshType::ConstFaceIterator ConstFaceIterator; - typedef typename MeshType::FaceContainer FaceContainer; - typedef typename MeshType::CoordType CoordType; + typedef typename MeshType::VertexType VertexType; + typedef typename MeshType::VertexPointer VertexPointer; + typedef typename MeshType::VertexIterator VertexIterator; + typedef typename MeshType::ScalarType ScalarType; + typedef typename MeshType::FaceType FaceType; + typedef typename MeshType::FacePointer FacePointer; + typedef typename MeshType::FaceIterator FaceIterator; + typedef typename MeshType::ConstFaceIterator ConstFaceIterator; + typedef typename MeshType::FaceContainer FaceContainer; + typedef typename MeshType::CoordType CoordType; private : enum {X=0,Y=1,Z=2}; @@ -188,50 +188,51 @@ void CompFaceIntegrals(const FaceType &f) It requires a watertight mesh with per face normals. */ -void Compute(MeshType &m) +void Compute(const MeshType &m) { - tri::UpdateNormal::PerFaceNormalized(m); - double nx, ny, nz; + double nx, ny, nz; - T0 = T1[X] = T1[Y] = T1[Z] - = T2[X] = T2[Y] = T2[Z] - = TP[X] = TP[Y] = TP[Z] = 0; - for (auto fi=m.face.begin(); fi!=m.face.end();++fi) if(!(*fi).IsD() && vcg::DoubleArea(*fi)>std::numeric_limits::min()) { - const FaceType &f=(*fi); + T0 = T1[X] = T1[Y] = T1[Z] + = T2[X] = T2[Y] = T2[Z] + = TP[X] = TP[Y] = TP[Z] = 0; + for (auto fi=m.face.begin(); fi!=m.face.end();++fi) if(!(*fi).IsD() && vcg::DoubleArea(*fi)>std::numeric_limits::min()) + { + const FaceType &f=(*fi); + const auto fn = vcg::NormalizedTriangleNormal(f); - nx = fabs(f.N()[0]); - ny = fabs(f.N()[1]); - nz = fabs(f.N()[2]); - if (nx > ny && nx > nz) C = X; - else C = (ny > nz) ? 
Y : Z; - A = (C + 1) % 3; - B = (A + 1) % 3; + nx = fabs(fn[0]); + ny = fabs(fn[1]); + nz = fabs(fn[2]); + if (nx > ny && nx > nz) C = X; + else C = (ny > nz) ? Y : Z; + A = (C + 1) % 3; + B = (A + 1) % 3; - CompFaceIntegrals(f); + CompFaceIntegrals(f); - T0 += f.N()[X] * ((A == X) ? Fa : ((B == X) ? Fb : Fc)); + T0 += fn[X] * ((A == X) ? Fa : ((B == X) ? Fb : Fc)); - T1[A] += f.N()[A] * Faa; - T1[B] += f.N()[B] * Fbb; - T1[C] += f.N()[C] * Fcc; - T2[A] += f.N()[A] * Faaa; - T2[B] += f.N()[B] * Fbbb; - T2[C] += f.N()[C] * Fccc; - TP[A] += f.N()[A] * Faab; - TP[B] += f.N()[B] * Fbbc; - TP[C] += f.N()[C] * Fcca; - } + T1[A] += fn[A] * Faa; + T1[B] += fn[B] * Fbb; + T1[C] += fn[C] * Fcc; + T2[A] += fn[A] * Faaa; + T2[B] += fn[B] * Fbbb; + T2[C] += fn[C] * Fccc; + TP[A] += fn[A] * Faab; + TP[B] += fn[B] * Fbbc; + TP[C] += fn[C] * Fcca; + } - T1[X] /= 2; T1[Y] /= 2; T1[Z] /= 2; - T2[X] /= 3; T2[Y] /= 3; T2[Z] /= 3; - TP[X] /= 2; TP[Y] /= 2; TP[Z] /= 2; + T1[X] /= 2; T1[Y] /= 2; T1[Z] /= 2; + T2[X] /= 3; T2[Y] /= 3; T2[Z] /= 3; + TP[X] /= 2; TP[Y] /= 2; TP[Z] /= 2; } /*! \brief Return the Volume (or mass) of the mesh. Meaningful only if the mesh is watertight. */ -ScalarType Mass() +ScalarType Mass(void) const { return static_cast(T0); } @@ -240,15 +241,17 @@ ScalarType Mass() Meaningful only if the mesh is watertight. 
*/ -Point3 CenterOfMass() +Point3 CenterOfMass(void) const { - Point3 r; - r[X] = T1[X] / T0; - r[Y] = T1[Y] / T0; - r[Z] = T1[Z] / T0; - return r; + Point3 r; + r[X] = T1[X] / T0; + r[Y] = T1[Y] / T0; + r[Z] = T1[Z] / T0; + return r; } -void InertiaTensor(Matrix33 &J ){ + +void InertiaTensor(Matrix33 &J) const +{ Point3 r; r[X] = T1[X] / T0; r[Y] = T1[Y] / T0; @@ -270,27 +273,27 @@ void InertiaTensor(Matrix33 &J ){ } //void InertiaTensor(Matrix44 &J ) -void InertiaTensor(Eigen::Matrix3d &J ) +void InertiaTensor(Eigen::Matrix3d &J) const { - J=Eigen::Matrix3d::Identity(); - Point3d r; - r[X] = T1[X] / T0; - r[Y] = T1[Y] / T0; - r[Z] = T1[Z] / T0; - /* compute inertia tensor */ - J(X,X) = (T2[Y] + T2[Z]); - J(Y,Y) = (T2[Z] + T2[X]); - J(Z,Z) = (T2[X] + T2[Y]); - J(X,Y) = J(Y,X) = - TP[X]; - J(Y,Z) = J(Z,Y) = - TP[Y]; - J(Z,X) = J(X,Z) = - TP[Z]; + J=Eigen::Matrix3d::Identity(); + Point3d r; + r[X] = T1[X] / T0; + r[Y] = T1[Y] / T0; + r[Z] = T1[Z] / T0; + /* compute inertia tensor */ + J(X,X) = (T2[Y] + T2[Z]); + J(Y,Y) = (T2[Z] + T2[X]); + J(Z,Z) = (T2[X] + T2[Y]); + J(X,Y) = J(Y,X) = - TP[X]; + J(Y,Z) = J(Z,Y) = - TP[Y]; + J(Z,X) = J(X,Z) = - TP[Z]; - J(X,X) -= T0 * (r[Y]*r[Y] + r[Z]*r[Z]); - J(Y,Y) -= T0 * (r[Z]*r[Z] + r[X]*r[X]); - J(Z,Z) -= T0 * (r[X]*r[X] + r[Y]*r[Y]); - J(X,Y) = J(Y,X) += T0 * r[X] * r[Y]; - J(Y,Z) = J(Z,Y) += T0 * r[Y] * r[Z]; - J(Z,X) = J(X,Z) += T0 * r[Z] * r[X]; + J(X,X) -= T0 * (r[Y]*r[Y] + r[Z]*r[Z]); + J(Y,Y) -= T0 * (r[Z]*r[Z] + r[X]*r[X]); + J(Z,Z) -= T0 * (r[X]*r[X] + r[Y]*r[Y]); + J(X,Y) = J(Y,X) += T0 * r[X] * r[Y]; + J(Y,Z) = J(Z,Y) += T0 * r[Y] * r[Z]; + J(Z,X) = J(X,Z) += T0 * r[Z] * r[X]; } @@ -299,7 +302,7 @@ void InertiaTensor(Eigen::Matrix3d &J ) The result is factored as eigenvalues and eigenvectors (as ROWS). 
*/ -void InertiaTensorEigen(Matrix33 &EV, Point3 &ev ) +void InertiaTensorEigen(Matrix33 &EV, Point3 &ev) const { Eigen::Matrix3d it; InertiaTensor(it); @@ -376,7 +379,7 @@ static void Covariance(const MeshType & m, vcg::Point3 & bary, vcg:: } }; // end class Inertia - } // end namespace tri +} // end namespace tri } // end namespace vcg diff --git a/vcg/complex/algorithms/stat.h b/vcg/complex/algorithms/stat.h index 14a6d8fd..59ccaed7 100644 --- a/vcg/complex/algorithms/stat.h +++ b/vcg/complex/algorithms/stat.h @@ -239,18 +239,18 @@ public: return barycenter/areaSum; } - static ScalarType ComputeTetraMeshVolume(MeshType & m) + static ScalarType ComputeTetraMeshVolume(const MeshType & m) { ScalarType V = 0; - ForEachTetra(m, [&V] (TetraType & t) { + ForEachTetra(m, [&V] (const TetraType & t) { V += Tetra::ComputeVolume(t); }); return V; } - static ScalarType ComputeMeshVolume(MeshType & m) + static ScalarType ComputeMeshVolume(const MeshType & m) { Inertia I(m); return I.Mass(); From 50f1d8961eea0db3803ace49af6aef0f89ecca91 Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Wed, 17 Nov 2021 15:34:35 +0100 Subject: [PATCH 06/38] bugfix in Inertia Compute --- vcg/complex/algorithms/inertia.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/vcg/complex/algorithms/inertia.h b/vcg/complex/algorithms/inertia.h index c68c22fa..26cecb20 100644 --- a/vcg/complex/algorithms/inertia.h +++ b/vcg/complex/algorithms/inertia.h @@ -66,8 +66,8 @@ class Inertia private : enum {X=0,Y=1,Z=2}; - inline ScalarType SQR(ScalarType &x) const { return x*x;} - inline ScalarType CUBE(ScalarType &x) const { return x*x*x;} + inline ScalarType SQR(const ScalarType &x) const { return x*x;} + inline ScalarType CUBE(const ScalarType &x) const { return x*x*x;} int A; /* alpha */ int B; /* beta */ @@ -148,15 +148,13 @@ public: } -void CompFaceIntegrals(const FaceType &f) +void CompFaceIntegrals(const FaceType &f, const Point3 &n) { - Point3 n; ScalarType w; - double k1, 
k2, k3, k4; + double k1, k2, k3, k4; compProjectionIntegrals(f); - n = f.N(); w = -f.V(0)->P()*n; k1 = 1 / n[C]; k2 = k1 * k1; k3 = k2 * k1; k4 = k3 * k1; @@ -208,7 +206,7 @@ void Compute(const MeshType &m) A = (C + 1) % 3; B = (A + 1) % 3; - CompFaceIntegrals(f); + CompFaceIntegrals(f, fn); T0 += fn[X] * ((A == X) ? Fa : ((B == X) ? Fb : Fc)); From 1fa21d2a6414a2c42620661498a330713218a3f9 Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Wed, 17 Nov 2021 15:35:19 +0100 Subject: [PATCH 07/38] more const correctness in Stat --- vcg/complex/algorithms/stat.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/vcg/complex/algorithms/stat.h b/vcg/complex/algorithms/stat.h index 59ccaed7..40eca014 100644 --- a/vcg/complex/algorithms/stat.h +++ b/vcg/complex/algorithms/stat.h @@ -99,7 +99,7 @@ public: static std::pair ComputePerFaceQualityMinMax( const MeshType & m) { tri::RequirePerFaceQuality(m); - std::pair minmax = std::make_pair(std::numeric_limits::max(),-std::numeric_limits::max()); + std::pair minmax = std::make_pair(std::numeric_limits::max(),-std::numeric_limits::lowest()); ConstFaceIterator fi; for(fi = m.face.begin(); fi != m.face.end(); ++fi) @@ -119,26 +119,26 @@ public: maxQ = minmax.second; } - static std::pair ComputePerTetraQualityMinMax(MeshType & m) + static std::pair ComputePerTetraQualityMinMax(const MeshType & m) { tri::RequirePerTetraQuality(m); - std::pair minmax = std::make_pair(std::numeric_limits::max(), std::numeric_limits::min()); + std::pair minmax = std::make_pair(std::numeric_limits::max(), std::numeric_limits::lowest()); - ForEachTetra(m, [&minmax] (TetraType & t) { - if (t.Q() < minmax.first) minmax.first = t.Q(); - if (t.Q() > minmax.second) minmax.second = t.Q(); + ForEachTetra(m, [&minmax] (const TetraType & t) { + if (t.cQ() < minmax.first) minmax.first = t.cQ(); + if (t.cQ() > minmax.second) minmax.second = t.cQ(); }); return minmax; } - static ScalarType ComputePerTetraQualityAvg(MeshType 
& m) + static ScalarType ComputePerTetraQualityAvg(const MeshType & m) { tri::RequirePerTetraQuality(m); ScalarType avgQ = 0; - ForEachTetra(m, [&avgQ] (TetraType & t) { - avgQ += t.Q(); + ForEachTetra(m, [&avgQ] (const TetraType & t) { + avgQ += t.cQ(); }); return avgQ /= (ScalarType) m.TN(); @@ -176,17 +176,17 @@ public: return (AvgQ/(ScalarType)num); } - static std::pair ComputePerEdgeQualityMinMax( MeshType & m) + static std::pair ComputePerEdgeQualityMinMax(const MeshType & m) { tri::RequirePerEdgeQuality(m); - std::pair minmax = std::make_pair(std::numeric_limits::max(),-std::numeric_limits::max()); + std::pair minmax = std::make_pair(std::numeric_limits::max(),std::numeric_limits::lowest()); EdgeIterator ei; for(ei = m.edge.begin(); ei != m.edge.end(); ++ei) if(!(*ei).IsD()) { - if( (*ei).Q() < minmax.first) minmax.first =(*ei).Q(); - if( (*ei).Q() > minmax.second) minmax.second=(*ei).Q(); + if( (*ei).cQ() < minmax.first) minmax.first =(*ei).cQ(); + if( (*ei).cQ() > minmax.second) minmax.second=(*ei).cQ(); } return minmax; } From 2c1279f880a514fe4189db260e53cd9fad19f61b Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Wed, 17 Nov 2021 15:37:07 +0100 Subject: [PATCH 08/38] static assert for Eigen type conversion --- vcg/space/deprecated_point3.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vcg/space/deprecated_point3.h b/vcg/space/deprecated_point3.h index 81e67862..2bfac5e9 100644 --- a/vcg/space/deprecated_point3.h +++ b/vcg/space/deprecated_point3.h @@ -202,7 +202,8 @@ public: template inline EigenVector ToEigenVector(void) const { - assert(EigenVector::RowsAtCompileTime == 3 || EigenVector::RowsAtCompileTime == 4); + static_assert(EigenVector::RowsAtCompileTime == 3 || EigenVector::RowsAtCompileTime == 4, + "EigenVector type has not 3 or 4 components"); EigenVector b = EigenVector::Zero(); b[0]=_v[0]; b[1]=_v[1]; From 22b9044222a7e49602a6e4985aa52ecada42f813 Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Wed, 17 Nov 2021 
15:43:42 +0100 Subject: [PATCH 09/38] cleaning --- vcg/complex/algorithms/geodesic.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/vcg/complex/algorithms/geodesic.h b/vcg/complex/algorithms/geodesic.h index 40436918..bb745456 100644 --- a/vcg/complex/algorithms/geodesic.h +++ b/vcg/complex/algorithms/geodesic.h @@ -207,18 +207,12 @@ public: typedef SimpleTempData, TempData > TempDataType; - class pred { - public: - pred () {}; - bool operator()(const VertDist& v0, const VertDist& v1) const - {return (v0.d > v1.d);} - }; - - //struct pred: public std::binary_function{ - // pred(){} - // bool operator()(const VertDist& v0, const VertDist& v1) const - // {return (v0.d > v1.d);} - //}; + struct pred { + pred() {}; + bool operator()(const VertDist& v0, const VertDist& v1) const { + return (v0.d > v1.d); + } + }; /* * From 090e0e438e8bd98f32be1cf41ee99811b805b90d Mon Sep 17 00:00:00 2001 From: alemuntoni Date: Wed, 17 Nov 2021 16:42:24 +0100 Subject: [PATCH 10/38] removing using namespace std from header files --- wrap/gcache/cache.h | 4 ++-- wrap/gui/frustum.h | 2 +- wrap/gui/trackmode.cpp | 10 +++++----- wrap/gui/trackutils.h | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/wrap/gcache/cache.h b/wrap/gcache/cache.h index f25ab1ea..f790690e 100644 --- a/wrap/gcache/cache.h +++ b/wrap/gcache/cache.h @@ -26,7 +26,7 @@ typedef unsigned __int64 uint64_t; #include "provider.h" -using namespace std; +//using namespace std; /* this cache system enforce the rule that the items in a cache are always in all the cache below */ /* two mechanism to remove tokens from the cache: 1) set token count to something low @@ -169,7 +169,7 @@ protected: if(unload() || load()) { new_data.testAndSetOrdered(0, 1); //if not changed, set as changed input->check_queue.open(); //we signal ourselves to check again - cout << "loaded or unloaded\n"; + std::cout << "loaded or unloaded\n"; } input->check_queue.leave(); } diff --git 
a/wrap/gui/frustum.h b/wrap/gui/frustum.h index 49a3f6af..8f6108ca 100644 --- a/wrap/gui/frustum.h +++ b/wrap/gui/frustum.h @@ -59,7 +59,7 @@ Adding copyright. #include #include -using namespace std; +//using namespace std; namespace vcg { diff --git a/wrap/gui/trackmode.cpp b/wrap/gui/trackmode.cpp index 7365c764..5c2d29f5 100644 --- a/wrap/gui/trackmode.cpp +++ b/wrap/gui/trackmode.cpp @@ -82,7 +82,7 @@ void SphereMode::Apply (Trackball * tb, Point3f new_point) // Figure out how much to rotate around that axis. // float phi = Distance (hitNew, hitOld) / tb->radius; // float phi = vcg::Angle(hitNew - center,hitOld - center)*(Distance(hitNew,center)/tb->radius); - float phi = max(vcg::Angle(hitNew - center,hitOld - center),(Distance(hitNew,hitOld)/tb->radius)) ; + float phi = std::max(vcg::Angle(hitNew - center,hitOld - center),(Distance(hitNew,hitOld)/tb->radius)) ; tb->track.rot = Quaternionf (-phi, axis) * tb->last_track.rot; } @@ -128,12 +128,12 @@ void ZMode::Draw(Trackball * tb){ // Scale mode implementation. 
void ScaleMode::Apply (Trackball * tb, float WheelNotch) { - tb->track.sca *= pow (1.2f, -WheelNotch); + tb->track.sca *= std::pow (1.2f, -WheelNotch); } void ScaleMode::Apply (Trackball * tb, Point3f new_point) { - tb->track.sca = tb->last_track.sca * pow (3.0f, -(getDeltaY(tb,new_point))); + tb->track.sca = tb->last_track.sca * std::pow (3.0f, -(getDeltaY(tb,new_point))); } void ScaleMode::Draw(Trackball * tb){ @@ -787,7 +787,7 @@ void NavigatorWasdMode::Animate(unsigned int msec, Trackball * tb){ float vel = current_speed_h.Norm(); if (veltrack.tra[1]+=step_last; diff --git a/wrap/gui/trackutils.h b/wrap/gui/trackutils.h index 4e3d1ede..6df38318 100644 --- a/wrap/gui/trackutils.h +++ b/wrap/gui/trackutils.h @@ -35,7 +35,7 @@ #include #include #include -using namespace std; +//using namespace std; namespace vcg { From 6b44e8e23ecabcaf01f8dfdbdcfaa30d28f8f742 Mon Sep 17 00:00:00 2001 From: alemuntoni Date: Wed, 17 Nov 2021 16:57:23 +0100 Subject: [PATCH 11/38] removing using namespace std from header files --- wrap/gcache/cache.h | 2 +- wrap/gcache/controller.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/wrap/gcache/cache.h b/wrap/gcache/cache.h index f790690e..713e7a41 100644 --- a/wrap/gcache/cache.h +++ b/wrap/gcache/cache.h @@ -212,7 +212,7 @@ protected: } else { //last item is locked need to reorder stack remove = this->heap.popMin(); this->heap.push(remove); - cout << "Reordering stack something (what?)\n"; + std::cout << "Reordering stack something (what?)\n"; return true; } } diff --git a/wrap/gcache/controller.h b/wrap/gcache/controller.h index 7c974887..ffe54903 100644 --- a/wrap/gcache/controller.h +++ b/wrap/gcache/controller.h @@ -140,7 +140,7 @@ class Controller { void resume() { assert(!stopped); assert(paused); - cout << "Resume" << endl; + std::cout << "Resume" << std::endl; //unlock and open all doors for(unsigned int i = 0; i < caches.size(); i++) { From 8de019c1e3ca1b982c11ba8815dd17638cb1f0af Mon Sep 17 00:00:00 2001 
From: alemuntoni Date: Wed, 17 Nov 2021 17:39:01 +0100 Subject: [PATCH 12/38] removing using namespace std from header files --- vcg/complex/algorithms/create/platonic.h | 2 +- .../create/plymc/simplemeshprovider.h | 2 +- vcg/complex/algorithms/isotropic_remeshing.h | 16 +- vcg/complex/algorithms/overlap_estimation.h | 25 +- vcg/complex/algorithms/pointcloud_normal.h | 2 +- vcg/complex/algorithms/smooth.h | 10 +- vcg/complex/algorithms/update/fitmaps.h | 3 - .../algorithms/update/halfedge_topology.h | 226 +++++++++--------- vcg/complex/algorithms/voronoi_processing.h | 20 +- .../algorithms/voronoi_volume_sampling.h | 4 +- 10 files changed, 150 insertions(+), 160 deletions(-) diff --git a/vcg/complex/algorithms/create/platonic.h b/vcg/complex/algorithms/create/platonic.h index 386bcff1..6d1573db 100644 --- a/vcg/complex/algorithms/create/platonic.h +++ b/vcg/complex/algorithms/create/platonic.h @@ -1125,7 +1125,7 @@ struct _SphUsedTypes : public UsedTypes< Use<_SphVertex> ::AsVertexType, class _SphVertex : public Vertex<_SphUsedTypes, vertex::Coord3f, vertex::Normal3f, vertex::BitFlags >{}; class _SphFace : public Face< _SphUsedTypes, face::VertexRef, face::Normal3f, face::BitFlags, face::FFAdj > {}; -class _SphMesh : public tri::TriMesh< vector<_SphVertex>, vector<_SphFace> > {}; +class _SphMesh : public tri::TriMesh< std::vector<_SphVertex>, std::vector<_SphFace> > {}; template diff --git a/vcg/complex/algorithms/create/plymc/simplemeshprovider.h b/vcg/complex/algorithms/create/plymc/simplemeshprovider.h index 87d20baf..180e4fa5 100644 --- a/vcg/complex/algorithms/create/plymc/simplemeshprovider.h +++ b/vcg/complex/algorithms/create/plymc/simplemeshprovider.h @@ -147,7 +147,7 @@ public: bool openALN (const char* alnName) { - vector rmaps; + std::vector rmaps; ALNParser::ParseALN(rmaps, alnName); for(size_t i=0; i ff; + std::vector ff; face::VFExtendedStarVF(&*vi, 2, ff); ScalarType tot = 0.f; @@ -906,8 +906,8 @@ private: p1.FlipV(); - vector vi0, vi1; - vector 
ff0, ff1; + std::vector vi0, vi1; + std::vector ff0, ff1; face::VFStarVF(p0.V(), ff0, vi0); face::VFStarVF(p1.V(), ff1, vi1); @@ -956,7 +956,7 @@ private: Point3 collapseNV, collapsedNV0, collapsedNV1; collapseNV = (p.V()->P() - p.VFlip()->P()).normalized(); - vector vv; + std::vector vv; face::VVStarVF(p.V(), vv); for(VertexType *v: vv) @@ -1052,9 +1052,9 @@ private: * \ | / \|/ +0 \ / -1 * v3 v3 v3 */ - static bool chooseBestCrossCollapse(PosType &p, VertexPair& bp, vector &ff) + static bool chooseBestCrossCollapse(PosType &p, VertexPair& bp, std::vector &ff) { - vector vv0, vv1, vv2, vv3; + std::vector vv0, vv1, vv2, vv3; VertexType *v0, *v1, *v2, *v3; v0 = p.F()->V1(p.VInd()); @@ -1175,8 +1175,8 @@ private: PosType pi(&*fi, i); if(!pi.V()->IsB()) { - vector ff; - vector vi; + std::vector ff; + std::vector vi; face::VFStarVF(pi.V(), ff, vi); //if cross need to check what creases you have and decide where to collapse accordingly diff --git a/vcg/complex/algorithms/overlap_estimation.h b/vcg/complex/algorithms/overlap_estimation.h index 1f75d786..9efc88a4 100644 --- a/vcg/complex/algorithms/overlap_estimation.h +++ b/vcg/complex/algorithms/overlap_estimation.h @@ -32,9 +32,6 @@ #include -using namespace std; -using namespace vcg; - /** \brief This class provides a strategy to estimate the overlap percentage of two range maps/point clouds. 
* * This class can be used, for exemple, into an automatic alignment process to check the quality of the @@ -56,9 +53,9 @@ template class OverlapEstimation typedef typename MeshType::FaceType FaceType; typedef typename MeshType::VertexPointer VertexPointer; typedef typename MeshType::VertexIterator VertexIterator; - typedef typename vector::iterator VertexPointerIterator; - typedef GridStaticPtr MeshGrid; - typedef tri::EmptyTMark MarkerVertex; + typedef typename std::vector::iterator VertexPointerIterator; + typedef vcg::GridStaticPtr MeshGrid; + typedef vcg::tri::EmptyTMark MarkerVertex; private: /** Private simple class needed to perform sampling of pointers to vertexes. */ @@ -70,7 +67,7 @@ template class OverlapEstimation VertexPointerSampler(){ m = new MeshType(); m->Tr.SetIdentity(); m->sfn=0; } ~VertexPointerSampler(){ if(m) delete m; } - vector sampleVec; + std::vector sampleVec; void AddVert(VertexType &p){ sampleVec.push_back(&p); } //this function is the only we really need void AddFace(const FaceType &f, const CoordType &p){} @@ -110,7 +107,7 @@ template class OverlapEstimation private: MeshType* mFix; /** Pointer to mesh \c mFix. */ MeshType* mMov; /** Pointer to mesh \c mMov. */ - vector >* normBuckets; //structure to hold normals bucketing. Needed for normal equalized sampling during consensus + std::vector >* normBuckets; //structure to hold normals bucketing. 
Needed for normal equalized sampling during consensus MeshGrid* gridFix; //variable to manage uniform grid MarkerVertex markerFunctorFix; //variable to manage uniform grid @@ -182,7 +179,7 @@ template class OverlapEstimation //if no buckets are provided get a vector of vertex pointers sampled uniformly //else, get a vector of vertex pointers sampled in a normal equalized manner; used as query points - vector queryVert; + std::vector queryVert; if(param.normalEqualization){ assert(normBuckets); for(unsigned int i=0; ivert.size(); i++) queryVert.push_back(&(mMov->vert[i]));//do a copy of pointers to vertexes @@ -249,7 +246,7 @@ template class OverlapEstimation * @param vert Destination vector. * @param sampleNum Requested number of vertexes. */ - void SampleVertUniform(MESH_TYPE& m, vector& vert, int sampleNum) + void SampleVertUniform(MESH_TYPE& m, std::vector& vert, int sampleNum) { VertexPointerSampler sampler; tri::SurfaceSampling::VertexUniform(m, sampler, sampleNum); @@ -258,12 +255,12 @@ template class OverlapEstimation /** Buckets normals of the vertexes contained in \c vert . * \return A vector of vectors containing indexes to \c vert . */ - vector >* BucketVertexNormal(typename MESH_TYPE::VertContainer& vert, int bucketDim = 30) + std::vector >* BucketVertexNormal(typename MESH_TYPE::VertContainer& vert, int bucketDim = 30) { static vector NV; if(NV.size()==0) GenNormal::Uniform(bucketDim,NV); - vector >* BKT = new vector >(NV.size()); //NV size is greater then bucketDim, so don't change this! + std::vector >* BKT = new std::vector >(NV.size()); //NV size is greater then bucketDim, so don't change this! 
int ind; for(int i=0;i class OverlapEstimation { assert(normBuckets); // vettore di contatori per sapere quanti punti ho gia' preso per ogni bucket - vector BKTpos(normBuckets->size(),0); + std::vector BKTpos(normBuckets->size(),0); if(SampleNum >= int(vert.size())) SampleNum= int(vert.size()-1); @@ -289,7 +286,7 @@ template class OverlapEstimation for(int i=0;isize()); // Scelgo un Bucket int &CURpos = BKTpos[ind]; - vector &CUR = (*normBuckets)[ind]; + std::vector &CUR = (*normBuckets)[ind]; if(CURpos::PriorityQueue nq; for (VertexIterator vi=m.vert.begin();vi!=m.vert.end();++vi) { diff --git a/vcg/complex/algorithms/smooth.h b/vcg/complex/algorithms/smooth.h index d2e4dbdf..7b568135 100644 --- a/vcg/complex/algorithms/smooth.h +++ b/vcg/complex/algorithms/smooth.h @@ -1417,7 +1417,7 @@ A(...) (2-2nm) = // move vertex vp->P() = ct; - vector faces2 = HalfEdgeTopology::get_incident_faces(vp); + std::vector faces2 = HalfEdgeTopology::get_incident_faces(vp); // estimate normal typename MeshType::CoordType avgn(0, 0, 0); @@ -1425,7 +1425,7 @@ A(...) (2-2nm) = for (unsigned int i = 0; i < faces2.size(); i++) if (faces2[i]) { - vector vertices = HalfEdgeTopology::getVertices(faces2[i]); + std::vector vertices = HalfEdgeTopology::getVertices(faces2[i]); assert(vertices.size() == 4); @@ -1444,9 +1444,9 @@ A(...) 
(2-2nm) = typename MeshTypeTri::FaceType *f = 0; typename MeshTypeTri::FaceType *fr = 0; - vector closests; - vector minDists; - vector faces; + std::vector closests; + std::vector minDists; + std::vector faces; ray.SetDirection(-raydir); f = vcg::tri::DoRay(gridmesh, grid, ray, diag / 4.0, t); diff --git a/vcg/complex/algorithms/update/fitmaps.h b/vcg/complex/algorithms/update/fitmaps.h index d1598640..917e2e57 100644 --- a/vcg/complex/algorithms/update/fitmaps.h +++ b/vcg/complex/algorithms/update/fitmaps.h @@ -42,9 +42,6 @@ #include - -using namespace Eigen; - namespace vcg { namespace tri { template diff --git a/vcg/complex/algorithms/update/halfedge_topology.h b/vcg/complex/algorithms/update/halfedge_topology.h index 09138600..25dc9787 100644 --- a/vcg/complex/algorithms/update/halfedge_topology.h +++ b/vcg/complex/algorithms/update/halfedge_topology.h @@ -25,10 +25,6 @@ #include -using namespace std; -using namespace vcg::hedge; -using namespace vcg::tri; - namespace vcg { namespace tri @@ -123,7 +119,7 @@ namespace vcg HEdgePointer hp; - vector vps = getVertices(fp); + std::vector vps = getVertices(fp); assert(vps.size()==4); @@ -138,7 +134,7 @@ namespace vcg while(hp->HVp() != vp) hp= hp->HNp(); - vector hps = getHEdges(fp,hp); + std::vector hps = getHEdges(fp,hp); assert(vp == hps[0]->HVp()); @@ -195,10 +191,10 @@ namespace vcg if(b[j]) { if(HasHE) - Allocator::DeleteEdge(m, *(hps[i]->HEp()) ); + vcg::tri::Allocator::DeleteEdge(m, *(hps[i]->HEp()) ); - Allocator::DeleteHEdge(m, *(hps[i]->HOp()) ); - Allocator::DeleteHEdge(m, *(hps[i+1]->HOp()) ); + vcg::tri::Allocator::DeleteHEdge(m, *(hps[i]->HOp()) ); + vcg::tri::Allocator::DeleteHEdge(m, *(hps[i+1]->HOp()) ); hps[i+1]->HVp()->VHp() = NULL; @@ -218,17 +214,17 @@ namespace vcg } - Allocator::DeleteFace(m, *(fp) ); - Allocator::DeleteVertex(m, *(opposite_vertex) ); + vcg::tri::Allocator::DeleteFace(m, *(fp) ); + vcg::tri::Allocator::DeleteVertex(m, *(opposite_vertex) ); if(HasHE) { - 
Allocator::DeleteEdge(m, *(hps[1]->HEp()) ); - Allocator::DeleteEdge(m, *(hps[3]->HEp()) ); + vcg::tri::Allocator::DeleteEdge(m, *(hps[1]->HEp()) ); + vcg::tri::Allocator::DeleteEdge(m, *(hps[3]->HEp()) ); } - Allocator::DeleteHEdge(m, *(hps[0]) ); - Allocator::DeleteHEdge(m, *(hps[1]) ); - Allocator::DeleteHEdge(m, *(hps[2]) ); - Allocator::DeleteHEdge(m, *(hps[3]) ); + vcg::tri::Allocator::DeleteHEdge(m, *(hps[0]) ); + vcg::tri::Allocator::DeleteHEdge(m, *(hps[1]) ); + vcg::tri::Allocator::DeleteHEdge(m, *(hps[2]) ); + vcg::tri::Allocator::DeleteHEdge(m, *(hps[3]) ); return vp; @@ -288,17 +284,17 @@ namespace vcg fp1->FHp() = hp->HNp(); - Allocator::DeleteVertex(m, *vp); + vcg::tri::Allocator::DeleteVertex(m, *vp); if(MeshType::HEdgeType::HasHEAdjacency()) { - Allocator::DeleteEdge(m, *(hp->HEp()) ); - Allocator::DeleteEdge(m, *(hp->HPp()->HEp()) ); + vcg::tri::Allocator::DeleteEdge(m, *(hp->HEp()) ); + vcg::tri::Allocator::DeleteEdge(m, *(hp->HPp()->HEp()) ); } - Allocator::DeleteHEdge(m, *hp ); - Allocator::DeleteHEdge(m, *(hp->HOp()) ); - Allocator::DeleteHEdge(m, *(hp->HPp()) ); - Allocator::DeleteHEdge(m, *(hp->HPp()->HOp()) ); - Allocator::DeleteFace(m, *fp2 ); + vcg::tri::Allocator::DeleteHEdge(m, *hp ); + vcg::tri::Allocator::DeleteHEdge(m, *(hp->HOp()) ); + vcg::tri::Allocator::DeleteHEdge(m, *(hp->HPp()) ); + vcg::tri::Allocator::DeleteHEdge(m, *(hp->HPp()->HOp()) ); + vcg::tri::Allocator::DeleteFace(m, *fp2 ); return fp1; @@ -335,13 +331,13 @@ namespace vcg bool HasHE = MeshType::HEdgeType::HasHEAdjacency(); bool HasEH = MeshType::EdgeType::HasEHAdjacency(); - vector ext_hedges; + std::vector ext_hedges; - vector int_hedges = getHEdges(fp); + std::vector int_hedges = getHEdges(fp); - Allocator::DeleteFace( m, *(fp) ); + vcg::tri::Allocator::DeleteFace( m, *(fp) ); - for(typename vector::iterator hi = int_hedges.begin(); hi != int_hedges.end();++hi) + for(typename std::vector::iterator hi = int_hedges.begin(); hi != int_hedges.end();++hi) { 
if((*hi)->HOp()->HFp() != fp) @@ -349,15 +345,15 @@ namespace vcg else if(vertex_valence((*hi)->HVp()) == 1) { - Allocator::DeleteVertex( m, *((*hi)->HVp()) ); + vcg::tri::Allocator::DeleteVertex( m, *((*hi)->HVp()) ); if(HasHE) - Allocator::DeleteEdge( m, *((*hi)->HEp()) ); + vcg::tri::Allocator::DeleteEdge( m, *((*hi)->HEp()) ); } } - for(typename vector::iterator hi = int_hedges.begin(); hi != int_hedges.end();++hi) - Allocator::DeleteHEdge( m, *(*hi) ); + for(typename std::vector::iterator hi = int_hedges.begin(); hi != int_hedges.end();++hi) + vcg::tri::Allocator::DeleteHEdge( m, *(*hi) ); assert(ext_hedges.size() == 2); @@ -369,7 +365,7 @@ namespace vcg if(HasHE) { - Allocator::DeleteEdge( m, *(ext_hedges[1]->HEp()) ); + vcg::tri::Allocator::DeleteEdge( m, *(ext_hedges[1]->HEp()) ); ext_hedges[1]->HEp() = ext_hedges[0]->HEp(); @@ -390,11 +386,11 @@ namespace vcg if(HasHE) { - Allocator::DeleteEdge( m, *( ext_hedges[0]->HEp()) ); - Allocator::DeleteEdge( m, *( ext_hedges[1]->HEp()) ); + vcg::tri::Allocator::DeleteEdge( m, *( ext_hedges[0]->HEp()) ); + vcg::tri::Allocator::DeleteEdge( m, *( ext_hedges[1]->HEp()) ); } - Allocator::DeleteHEdge( m, *( ext_hedges[0]) ); - Allocator::DeleteHEdge( m, *( ext_hedges[1]) ); + vcg::tri::Allocator::DeleteHEdge( m, *( ext_hedges[0]) ); + vcg::tri::Allocator::DeleteHEdge( m, *( ext_hedges[1]) ); return NULL; } @@ -434,9 +430,9 @@ namespace vcg assert(!has_doublet_quad(fp1)); assert(!has_doublet_quad(fp2)); - vector fps; - typedef vector hedge_vect; - vector hps; + std::vector fps; + typedef std::vector hedge_vect; + std::vector hps; fps.push_back(fp1); fps.push_back(fp2); @@ -510,12 +506,12 @@ namespace vcg assert(MeshType::VertexType::HasVHAdjacency()); assert( vp->VHp() ); - Pos p(vp->VHp(), true); + vcg::hedge::Pos p(vp->VHp(), true); HEdgePointer hep = p.HE(); - typedef vector hedge_vect; - vector hedges; + typedef std::vector hedge_vect; + std::vector hedges; do { @@ -621,10 +617,10 @@ namespace vcg vp1->VHp() = 
hopp->HNp(); if(HasHEAdjacency(m)) - Allocator::DeleteEdge(m,*(hp->HEp())); - Allocator::DeleteHEdge(m,*hp); - Allocator::DeleteHEdge(m,*hopp); - Allocator::DeleteVertex(m,*vp); + vcg::tri::Allocator::DeleteEdge(m,*(hp->HEp())); + vcg::tri::Allocator::DeleteHEdge(m,*hp); + vcg::tri::Allocator::DeleteHEdge(m,*hopp); + vcg::tri::Allocator::DeleteVertex(m,*vp); return vp1; @@ -638,7 +634,7 @@ namespace vcg * * \return Pointer to the new face if it has been inserted, NULL otherwise */ - static FacePointer add_face(MeshType &m, vector &vps) + static FacePointer add_face(MeshType &m, std::vector &vps) { assert(MeshType::VertexType::HasVHAdjacency()); @@ -657,13 +653,13 @@ namespace vcg assert( count(vps.begin(), vps.end(), vps[i]) == 1 ); } - vector hps; + std::vector hps; while(hps.size() < size) if( !can_add_hedge(vps, hps) ) return NULL; - vector non_manifold_vertices(size, false); + std::vector non_manifold_vertices(size, false); return add_face_unsafe( m,vps, hps, non_manifold_vertices); @@ -707,12 +703,12 @@ namespace vcg * * \return Pointer to the new face */ - static FacePointer add_face_unsafe(MeshType &m, vector &vps) + static FacePointer add_face_unsafe(MeshType &m, std::vector &vps) { unsigned int size = vps.size(); - vector hps; - vector non_manifold_vertices; + std::vector hps; + std::vector non_manifold_vertices; while(hps.size() < size) { @@ -736,7 +732,7 @@ namespace vcg * * \return Pointer to the new face */ - static FacePointer add_face_unsafe(MeshType &m, vector &vps, vector &hps, vector &non_manifold_vertices) + static FacePointer add_face_unsafe(MeshType &m, std::vector &vps, std::vector &hps, std::vector &non_manifold_vertices) { assert(MeshType::VertexType::HasVHAdjacency()); @@ -767,7 +763,7 @@ namespace vcg FacePointer fp; - FaceIterator fi = Allocator::AddFaces(m,1); + FaceIterator fi = vcg::tri::Allocator::AddFaces(m,1); (*fi).Alloc( size ); fp = &(*fi); @@ -780,12 +776,12 @@ namespace vcg if(HasEH || HasHE) { - ei = 
Allocator::AddEdges(m,edge_n); + ei = vcg::tri::Allocator::AddEdges(m,edge_n); for(EdgeIterator ei1 = ei; ei1 != m.edge.end(); ++ei1) (*ei1).SetD(); } - typename Allocator::template PointerUpdater pu; + typename vcg::tri::Allocator::template PointerUpdater pu; if(m.hedge.empty()) pu.oldBase = 0; @@ -795,7 +791,7 @@ namespace vcg pu.oldEnd = &m.hedge.back()+1; } - hi = Allocator::AddHEdges(m,2*edge_n); + hi = vcg::tri::Allocator::AddHEdges(m,2*edge_n); pu.newBase = &*(m.hedge.begin()); pu.newEnd = &m.hedge.back()+1; @@ -809,7 +805,7 @@ namespace vcg // update hedge pointers (if needed) if( pu.NeedUpdate() ) - for(typename vector::iterator hpsi = hps.begin(); hpsi != hps.end(); ++hpsi) + for(typename std::vector::iterator hpsi = hps.begin(); hpsi != hps.end(); ++hpsi) { if((*hpsi)) pu.Update(*hpsi); @@ -848,7 +844,7 @@ namespace vcg } } - vector hps1; + std::vector hps1; for(unsigned int i = 0; i < size; i++) { @@ -911,7 +907,7 @@ namespace vcg // after face insertion vertex will become non-manifold if(non_manifold_vertices[next]) { - Pos p(vps[next]->VHp(), true); + vcg::hedge::Pos p(vps[next]->VHp(), true); while(p.F()) { @@ -965,7 +961,7 @@ namespace vcg static void remove_face_unsafe (MeshType &m, FacePointer fp) { - vector hps = getHEdges(fp); + std::vector hps = getHEdges(fp); int size = hps.size(); @@ -991,11 +987,11 @@ namespace vcg } else { - Allocator::DeleteHEdge( m, *hps[i] ); - Allocator::DeleteHEdge( m, *(hps[i]->HOp()) ); + vcg::tri::Allocator::DeleteHEdge( m, *hps[i] ); + vcg::tri::Allocator::DeleteHEdge( m, *(hps[i]->HOp()) ); if(MeshType::HEdgeType::HasHEAdjacency()) - Allocator::DeleteEdge( m, *(hps[i]->HEp()) ); + vcg::tri::Allocator::DeleteEdge( m, *(hps[i]->HEp()) ); if( !hps[(i+size-1)%size]->HOp()->HFp() ) { @@ -1021,7 +1017,7 @@ namespace vcg { bool manifold = true; - Pos p(hps[i]->HVp()->VHp(), true); + vcg::hedge::Pos p(hps[i]->HVp()->VHp(), true); p.HE()->SetV(); @@ -1051,7 +1047,7 @@ namespace vcg } - Allocator::DeleteFace(m,*fp); + 
vcg::tri::Allocator::DeleteFace(m,*fp); } @@ -1065,7 +1061,7 @@ namespace vcg * \retval true if hedge can be inserted * \retval false otherwise */ - static bool can_add_hedge( vector &vps, vector &hps ) + static bool can_add_hedge( std::vector &vps, std::vector &hps ) { unsigned int i = hps.size(); @@ -1087,7 +1083,7 @@ namespace vcg unsigned int size = vps.size(); - Pos p(he, false); + vcg::hedge::Pos p(he, false); he->SetV(); @@ -1155,11 +1151,11 @@ namespace vcg assert(fp); assert(!fp->IsD()); - Pos p(fp->FHp(), true); + vcg::hedge::Pos p(fp->FHp(), true); do { - vector incident_faces = get_incident_faces( p.V() ); + std::vector incident_faces = get_incident_faces( p.V() ); unsigned int size = incident_faces.size(); @@ -1197,11 +1193,11 @@ namespace vcg assert(hp->HFp()->VN() == 4); assert(!hp->IsD()); - vector faces; + std::vector faces; HEdgePointer hopp = hp->HNp()->HNp(); - vector faces1 = get_incident_faces(hp->HVp(), hp); - vector faces2 = get_incident_faces(hp->HNp()->HNp()->HVp(), hopp); + std::vector faces1 = get_incident_faces(hp->HVp(), hp); + std::vector faces2 = get_incident_faces(hp->HNp()->HNp()->HVp(), hopp); faces.assign(faces1.begin()+1, faces1.end()); faces.assign(faces2.begin()+1, faces2.end()); @@ -1236,11 +1232,11 @@ namespace vcg // Second check - set set1; - set set2; + std::set set1; + std::set set2; - vector vect1 = getVertices(hp->HVp()); - vector vect2 = getVertices(hp->HNp()->HNp()->HVp()); + std::vector vect1 = getVertices(hp->HVp()); + std::vector vect2 = getVertices(hp->HNp()->HNp()->HVp()); set1.insert(vect1.begin(), vect1.end()); set2.insert(vect2.begin(), vect2.end()); @@ -1249,9 +1245,9 @@ namespace vcg if(vect2.size() < size) size = vect2.size(); - vector intersection(size); + std::vector intersection(size); - typename vector::iterator it; + typename std::vector::iterator it; it = set_intersection(set1.begin(), set1.end(), set2.begin(), set2.end(), intersection.begin()); size = it- intersection.begin(); @@ -1278,7 +1274,7 @@ 
namespace vcg assert(vp); assert(!vp->IsD()); - set set1; + std::set set1; for(HEdgeIterator hi = m.hedge.begin(); hi != m.hedge.end(); ++hi) { if(!(*hi).IsD() && (*hi).HVp() == vp) @@ -1286,9 +1282,9 @@ namespace vcg } - vector vect2 = get_incident_hedges(vp); + std::vector vect2 = get_incident_hedges(vp); - set set2; + std::set set2; set2.insert(vect2.begin(), vect2.end()); return !equal(set1.begin(), set1.end(), set2.begin()); @@ -1308,7 +1304,7 @@ namespace vcg assert(vp); assert(!vp->IsD()); - vector faces = get_incident_faces(vp); + std::vector faces = get_incident_faces(vp); unsigned int size = faces.size(); int null_count = 0; @@ -1350,19 +1346,19 @@ namespace vcg * * \return Vector containing vertices */ - static vector getVertices(VertexPointer vp) + static std::vector getVertices(VertexPointer vp) { assert(vp); assert(!vp->IsD()); HEdgePointer hp = vp->VHp(); - vector ret; + std::vector ret; if( !hp ) return ret; - Pos p(hp); + vcg::hedge::Pos p(hp); do { @@ -1392,18 +1388,18 @@ namespace vcg * * \return Set containing faces */ - static set getFaces(VertexPointer vp) + static std::set getFaces(VertexPointer vp) { assert(vp); assert(!vp->IsD()); - set ret; + std::set ret; - vector vertices = getVertices(vp); + std::vector vertices = getVertices(vp); - for(typename vector::iterator vi = vertices.begin(); vi!= vertices.end(); ++vi) + for(typename std::vector::iterator vi = vertices.begin(); vi!= vertices.end(); ++vi) { - vector incident_faces = get_incident_faces(*vi); + std::vector incident_faces = get_incident_faces(*vi); ret.insert(incident_faces.begin(), incident_faces.end()); } @@ -1426,7 +1422,7 @@ namespace vcg assert(fp->FHp()); assert(!fp->IsD()); - Pos p( fp->FHp() ); + vcg::hedge::Pos p( fp->FHp() ); do { @@ -1450,7 +1446,7 @@ namespace vcg * * \return Vector containing the incident vertices */ - static vector getVertices(FacePointer fp, HEdgePointer starting_he = NULL) + static std::vector getVertices(FacePointer fp, HEdgePointer starting_he = 
NULL) { assert(fp); assert(!fp->IsD()); @@ -1460,9 +1456,9 @@ namespace vcg assert( starting_he->HFp() == fp ); - Pos p( starting_he, true ); + vcg::hedge::Pos p( starting_he, true ); - vector ret; + std::vector ret; do @@ -1491,7 +1487,7 @@ namespace vcg * * \return Vector containing the incident hedges */ - static vector getHEdges(FacePointer fp, HEdgePointer starting_he = NULL) + static std::vector getHEdges(FacePointer fp, HEdgePointer starting_he = NULL) { assert(fp); assert(!fp->IsD()); @@ -1501,9 +1497,9 @@ namespace vcg else starting_he = fp->FHp(); - Pos p( starting_he, true ); + vcg::hedge::Pos p( starting_he, true ); - vector ret; + std::vector ret; do { @@ -1530,7 +1526,7 @@ namespace vcg * * \return Vector containing the incident faces */ - static vector get_incident_faces(VertexPointer vp, HEdgePointer starting_he = NULL) + static std::vector get_incident_faces(VertexPointer vp, HEdgePointer starting_he = NULL) { assert(vp); assert(!vp->IsD()); @@ -1540,12 +1536,12 @@ namespace vcg else starting_he = vp->VHp(); - vector ret; + std::vector ret; if(!starting_he) return ret; - Pos p( starting_he, true ); + vcg::hedge::Pos p( starting_he, true ); do { @@ -1561,14 +1557,14 @@ namespace vcg } - static vector get_adjacent_faces(FacePointer fp) + static std::vector get_adjacent_faces(FacePointer fp) { assert(fp); assert(!fp->IsD()); - vector ret; + std::vector ret; - Pos p( fp->FHp() ); + vcg::hedge::Pos p( fp->FHp() ); assert(p.F() == fp); do @@ -1594,7 +1590,7 @@ namespace vcg * * \return Vector containing the incident hedges */ - static vector get_incident_hedges(VertexPointer vp, HEdgePointer starting_he = NULL) + static std::vector get_incident_hedges(VertexPointer vp, HEdgePointer starting_he = NULL) { assert(vp); assert(!vp->IsD()); @@ -1604,12 +1600,12 @@ namespace vcg else starting_he = vp->VHp(); - vector ret; + std::vector ret; if(!starting_he) return ret; - Pos p( starting_he, true ); + vcg::hedge::Pos p( starting_he, true ); do { @@ -1647,15 
+1643,15 @@ namespace vcg * * \return Vector containing the hedges */ - static vector find_doublet_hedges_quad(FacePointer fp) + static std::vector find_doublet_hedges_quad(FacePointer fp) { assert(fp); assert(fp->FHp()); assert(!fp->IsD()); - vector ret; + std::vector ret; - Pos p( fp->FHp(), true ); + vcg::hedge::Pos p( fp->FHp(), true ); do { @@ -1690,7 +1686,7 @@ namespace vcg if( !(vp->VHp()) ) return true; - Pos p( vp->VHp() ); + vcg::hedge::Pos p( vp->VHp() ); do { @@ -1722,7 +1718,7 @@ namespace vcg int ret = 0; - Pos p( vp->VHp() ); + vcg::hedge::Pos p( vp->VHp() ); do { @@ -1752,7 +1748,7 @@ namespace vcg assert(old_vp != new_vp); assert(!old_vp->IsD()); - Pos p(old_vp->VHp(),true); + vcg::hedge::Pos p(old_vp->VHp(),true); p.HE()->SetV(); diff --git a/vcg/complex/algorithms/voronoi_processing.h b/vcg/complex/algorithms/voronoi_processing.h index e3cf60ca..7bd437cd 100644 --- a/vcg/complex/algorithms/voronoi_processing.h +++ b/vcg/complex/algorithms/voronoi_processing.h @@ -432,7 +432,7 @@ static void ConvertVoronoiDiagramToMesh(MeshType &m, for(size_t i=0;i pt; + std::vector pt; for(size_t j=0;jV(qq)] == curSeed) @@ -457,12 +457,12 @@ static void ConvertVoronoiDiagramToMesh(MeshType &m, CoordType nZ = pl.Direction(); CoordType nX = (pt[0]-curSeed->P()).Normalize(); CoordType nY = (nX^nZ).Normalize(); - vector > angleVec(pt.size()); + std::vector > angleVec(pt.size()); for(size_t j=0;jP()).Normalize(); float angle = 180.0f+math::ToDeg(atan2(p*nY,p*nX)); - angleVec[j] = make_pair(angle,j); + angleVec[j] = std::make_pair(angle,j); } std::sort(angleVec.begin(),angleVec.end()); // Now build another piece of mesh. 
@@ -1230,8 +1230,8 @@ static int RestrictedVoronoiRelaxing(MeshType &m, std::vector &seedPo sumVec[seedInd].second+=vi->cP()*area[vi]; } - vector newseedVec; - vector newfixedVec; + std::vector newseedVec; + std::vector newfixedVec; for(size_t i=0;i ordered_pair(const genericType &a, co /// /// static void GenerateMidPointMap(MeshType &m, - map, VertexPointer > &midMap) + std::map, VertexPointer > &midMap) { PerVertexPointerHandle sources = tri::Allocator:: template GetPerVertexAttribute (m,"sources"); @@ -1601,10 +1601,10 @@ static void ConvertDelaunayTriangulationToMesh(MeshType &m, for(size_t i=0;i::AddVertex(outMesh, seedVec[i]->P(),Color4b::White); - map, int > midMapInd; + std::map, int > midMapInd; // Given a pair of sources gives the index of the mid vertex - map, VertexPointer > midMapPt; + std::map, VertexPointer > midMapPt; if(refineFlag) { GenerateMidPointMap(m, midMapPt); @@ -1658,7 +1658,7 @@ static void PreprocessForVoronoi(MeshType &m, ScalarType radius, for(int i=0;i(m,mid,min(edgeLen*2.0f,radius/vpp.refinementRatio)); + bool ret = tri::Refine(m,mid,std::min(edgeLen*2.0f,radius/vpp.refinementRatio)); if(!ret) break; } tri::Allocator::CompactEveryVector(m); @@ -1715,7 +1715,7 @@ static void RelaxRefineTriangulationSpring(MeshType &m, MeshType &delaMesh, int std::vector avgLenVec(delaMesh.vn,0); for(int i=0;i starVec; + std::vector starVec; face::VVStarVF(&delaMesh.vert[i],starVec); for(size_t j=0;j Date: Wed, 17 Nov 2021 17:48:00 +0100 Subject: [PATCH 13/38] fix samples after removing using namespace in header files --- .../trimesh_attribute_saving.cpp | 2 +- apps/sample/trimesh_hole/trimesh_hole.cpp | 2 +- .../trimesh_intersection_mesh.cpp | 6 +++--- apps/sample/trimesh_remeshing/trimesh_remeshing.cpp | 6 +++--- .../trimesh_texture_clean/trimesh_texture_clean.cpp | 4 ++-- vcg/complex/algorithms/curve_on_manifold.h | 10 +++++----- vcg/complex/algorithms/implicit_smooth.h | 4 ++-- vcg/complex/algorithms/update/halfedge_indexed.h | 4 ++-- 8 files 
changed, 19 insertions(+), 19 deletions(-) diff --git a/apps/sample/trimesh_attribute_saving/trimesh_attribute_saving.cpp b/apps/sample/trimesh_attribute_saving/trimesh_attribute_saving.cpp index 7afe64ee..400a9da4 100644 --- a/apps/sample/trimesh_attribute_saving/trimesh_attribute_saving.cpp +++ b/apps/sample/trimesh_attribute_saving/trimesh_attribute_saving.cpp @@ -45,7 +45,7 @@ class MyMesh : public vcg::tri::TriMesh< std::vector, std::vector(m, 3.0f, 1.0f); + vcg::tri::Torus(m, 3.0f, 1.0f); //! [Adding a few attributes] // add a per-vertex attribute with type float named "GaussianCurvature" MyMesh::PerVertexAttributeHandle diff --git a/apps/sample/trimesh_hole/trimesh_hole.cpp b/apps/sample/trimesh_hole/trimesh_hole.cpp index 814e6ff2..5ca6937a 100644 --- a/apps/sample/trimesh_hole/trimesh_hole.cpp +++ b/apps/sample/trimesh_hole/trimesh_hole.cpp @@ -229,7 +229,7 @@ int main(int argc,char ** argv){ { f1=f2; f2++; - TriSplit >::Apply(vf[i],&(*f1),&(*f2),&(*vertp),CenterPointBarycenter() ); + vcg::tri::TriSplit >::Apply(vf[i],&(*f1),&(*f2),&(*vertp),vcg::tri::CenterPointBarycenter() ); f1->SetS(); f2->SetS(); for(int itr=0;itr<3;itr++) diff --git a/apps/sample/trimesh_intersection_mesh/trimesh_intersection_mesh.cpp b/apps/sample/trimesh_intersection_mesh/trimesh_intersection_mesh.cpp index e60e7f10..55797140 100644 --- a/apps/sample/trimesh_intersection_mesh/trimesh_intersection_mesh.cpp +++ b/apps/sample/trimesh_intersection_mesh/trimesh_intersection_mesh.cpp @@ -70,7 +70,7 @@ int main(int ,char **) tri::UpdateSelection::FaceDilate(m1); tri::Clean::SelectIntersectingFaces(m2,m1); tri::UpdateSelection::FaceDilate(m2); - IsotropicRemeshing::Params params; + vcg::tri::IsotropicRemeshing::Params params; float len = (tri::Stat::ComputeFaceEdgeLengthAverage(m1,true) + tri::Stat::ComputeFaceEdgeLengthAverage(m1,true)); params.SetTargetLen(len*0.8f); @@ -78,8 +78,8 @@ int main(int ,char **) params.iter=1; // just one iteration to avoid overtessellating. 
params.selectedOnly=true; printf(" Input mesh %8i v %8i f\n",m1.VN(),m1.FN()); - IsotropicRemeshing::Do(m1, params); - IsotropicRemeshing::Do(m2, params); + vcg::tri::IsotropicRemeshing::Do(m1, params); + vcg::tri::IsotropicRemeshing::Do(m2, params); printf(" Input mesh %8i v %8i f\n",m1.VN(),m1.FN()); } tri::Clean::SelectIntersectingFaces(m1,m2); diff --git a/apps/sample/trimesh_remeshing/trimesh_remeshing.cpp b/apps/sample/trimesh_remeshing/trimesh_remeshing.cpp index 895e83f5..8eb20cf2 100644 --- a/apps/sample/trimesh_remeshing/trimesh_remeshing.cpp +++ b/apps/sample/trimesh_remeshing/trimesh_remeshing.cpp @@ -75,7 +75,7 @@ int main( int argc, char **argv ) // Mesh cleaning tri::Clean::RemoveUnreferencedVertex(original); - Allocator::CompactEveryVector(original); + vcg::tri::Allocator::CompactEveryVector(original); tri::UpdateNormal::PerVertexNormalizedPerFaceNormalized(original); @@ -90,7 +90,7 @@ int main( int argc, char **argv ) float maxSurfDist = maxSurfDistPerc*(original.bbox.Diag()/100.f); printf("Length Thr: %8.3f ~ %4.2f %% on %5.3f\n",lengthThr,targetLenPerc,original.bbox.Diag()); - IsotropicRemeshing::Params params; + vcg::tri::IsotropicRemeshing::Params params; params.SetTargetLen(lengthThr); params.SetFeatureAngleDeg(creaseAngle); params.iter=iterNum; @@ -111,7 +111,7 @@ int main( int argc, char **argv ) printf(" Input mesh %8i v %8i f\n",toremesh.VN(),toremesh.FN()); - IsotropicRemeshing::Do(toremesh, original, params); + vcg::tri::IsotropicRemeshing::Do(toremesh, original, params); vcg::tri::io::ExporterPLY::Save(toremesh, "remesh.ply"); printf("Output mesh %8i v %8i f\n",toremesh.VN(),toremesh.FN()); diff --git a/apps/sample/trimesh_texture_clean/trimesh_texture_clean.cpp b/apps/sample/trimesh_texture_clean/trimesh_texture_clean.cpp index e6e5029e..830d6b33 100644 --- a/apps/sample/trimesh_texture_clean/trimesh_texture_clean.cpp +++ b/apps/sample/trimesh_texture_clean/trimesh_texture_clean.cpp @@ -52,8 +52,8 @@ int main(int ,char ** ) // generate 
a simple 2D grid Grid(m,20,20,1,1); - // assign it a simple planar parametrization - tri:UpdateTexture::WedgeTexFromPlane(m,Point3f(1.0f,0,0),Point3f(0,1.0f,0),true); + // assign it a simple planar parametrization + tri::UpdateTexture::WedgeTexFromPlane(m,Point3f(1.0f,0,0),Point3f(0,1.0f,0),true); tri::io::ExporterOBJ::Save(m,"grid_0.obj",mask); // randomly perturb a few coord textures introducing fake seams diff --git a/vcg/complex/algorithms/curve_on_manifold.h b/vcg/complex/algorithms/curve_on_manifold.h index e03e1f5f..1921548c 100644 --- a/vcg/complex/algorithms/curve_on_manifold.h +++ b/vcg/complex/algorithms/curve_on_manifold.h @@ -474,9 +474,9 @@ bool TagFaceEdgeSelWithPolyLine(MeshType &poly,bool markFlag=true) { CoordType p0=f->P0(i); CoordType p1=f->P1(i); - if (p0>p1) std::swap(p0,p1); - if(edgeToPolyVertMap[make_pair(p0,p1)]) printf("Found an already used Edge %lu - %lu %lu!!!\n", tri::Index(base,f->V0(i)),tri::Index(base,f->V1(i)),tri::Index(poly,&*vi)); - edgeToPolyVertMap[make_pair(p0,p1)]=&*vi; + if (p0>p1) std::swap(p0,p1); + if(edgeToPolyVertMap[std::make_pair(p0,p1)]) printf("Found an already used Edge %lu - %lu %lu!!!\n", tri::Index(base,f->V0(i)),tri::Index(base,f->V1(i)),tri::Index(poly,&*vi)); + edgeToPolyVertMap[std::make_pair(p0,p1)]=&*vi; } } } @@ -1136,7 +1136,7 @@ public: CoordType p0 = ep.V()->P(); CoordType p1 = ep.VFlip()->P(); if (p0>p1) std::swap(p0,p1); - VertexPointer vp=edgeToPolyVertMap[make_pair(p0,p1)]; + VertexPointer vp=edgeToPolyVertMap[std::make_pair(p0,p1)]; return vp!=0; } }; @@ -1152,7 +1152,7 @@ public: CoordType p0 = ep.V()->P(); CoordType p1 = ep.VFlip()->P(); if (p0>p1) std::swap(p0,p1); - VertexPointer vp=edgeToPolyVertMap[make_pair(p0,p1)]; + VertexPointer vp=edgeToPolyVertMap[std::make_pair(p0,p1)]; assert(vp); nv.P()=vp->P(); return; diff --git a/vcg/complex/algorithms/implicit_smooth.h b/vcg/complex/algorithms/implicit_smooth.h index 1d0e3f47..a304bf8e 100644 --- a/vcg/complex/algorithms/implicit_smooth.h +++ 
b/vcg/complex/algorithms/implicit_smooth.h @@ -260,7 +260,7 @@ public: //add the entries for mass matrix if (SParam.useMassMatrix) - MeshToMatrix::MassMatrixEntry(mesh,IndexM,ValuesM,!SParam.SmoothQ); + vcg::tri::MeshToMatrix::MassMatrixEntry(mesh,IndexM,ValuesM,!SParam.SmoothQ); //then add entries for lagrange mult due to barycentric constraints for (size_t i=0;i > IndexL; std::vector ValuesL; - MeshToMatrix::GetLaplacianMatrix(mesh,IndexL,ValuesL,SParam.useCotWeight,SParam.lapWeight,!SParam.SmoothQ); + vcg::tri::MeshToMatrix::GetLaplacianMatrix(mesh,IndexL,ValuesL,SParam.useCotWeight,SParam.lapWeight,!SParam.SmoothQ); //initialize sparse laplacian matrix if (!SParam.SmoothQ) diff --git a/vcg/complex/algorithms/update/halfedge_indexed.h b/vcg/complex/algorithms/update/halfedge_indexed.h index 054215ef..c8ebda00 100644 --- a/vcg/complex/algorithms/update/halfedge_indexed.h +++ b/vcg/complex/algorithms/update/halfedge_indexed.h @@ -214,9 +214,9 @@ namespace vcg for(typename MeshType::EdgeIterator ei1 = m.edge.begin(); ei1 != m.edge.end(); ++ei1 ) { - vector hedges = HalfEdgeTopology::get_incident_hedges((*ei1).V(0)); + std::vector hedges = HalfEdgeTopology::get_incident_hedges((*ei1).V(0)); - for(typename vector::iterator hi = hedges.begin(); hi != hedges.end(); ++hi) + for(typename std::vector::iterator hi = hedges.begin(); hi != hedges.end(); ++hi) { if((*hi)->HOp()->HVp() == (*ei1).V(1)) { From 8bc75c8c4de05b0ae27092d8d4262ecce0f643c6 Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Wed, 17 Nov 2021 18:21:21 +0100 Subject: [PATCH 14/38] additional save to check nanoply possible inconsistencies in load/save behavior --- wrap/nanoply/nanoply_vcg/main.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/wrap/nanoply/nanoply_vcg/main.cpp b/wrap/nanoply/nanoply_vcg/main.cpp index 37005bac..364b6ea3 100644 --- a/wrap/nanoply/nanoply_vcg/main.cpp +++ b/wrap/nanoply/nanoply_vcg/main.cpp @@ -149,5 +149,7 @@ int main() MyMesh mesh2, mesh3; Load("example_ascii.ply", 
mesh2); Load("example_binary.ply", mesh3); + Save("example_ascii_1.ply", mesh2, false); + Save("example_binary_1.ply", mesh3, true); return 0; } From 82ac1bdc87d11b33e64b9d28e6022b42f616d654 Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Wed, 17 Nov 2021 23:36:06 +0100 Subject: [PATCH 15/38] remove 'using namespace std' also from comments --- wrap/gcache/cache.h | 1 - wrap/gui/frustum.h | 1 - wrap/gui/trackutils.h | 1 - 3 files changed, 3 deletions(-) diff --git a/wrap/gcache/cache.h b/wrap/gcache/cache.h index 713e7a41..4bccfd65 100644 --- a/wrap/gcache/cache.h +++ b/wrap/gcache/cache.h @@ -26,7 +26,6 @@ typedef unsigned __int64 uint64_t; #include "provider.h" -//using namespace std; /* this cache system enforce the rule that the items in a cache are always in all the cache below */ /* two mechanism to remove tokens from the cache: 1) set token count to something low diff --git a/wrap/gui/frustum.h b/wrap/gui/frustum.h index 8f6108ca..34c8de25 100644 --- a/wrap/gui/frustum.h +++ b/wrap/gui/frustum.h @@ -59,7 +59,6 @@ Adding copyright. 
#include #include -//using namespace std; namespace vcg { diff --git a/wrap/gui/trackutils.h b/wrap/gui/trackutils.h index 6df38318..1c5a8591 100644 --- a/wrap/gui/trackutils.h +++ b/wrap/gui/trackutils.h @@ -35,7 +35,6 @@ #include #include #include -//using namespace std; namespace vcg { From b9f865d39e717d47c71440901caf1cffec63b86f Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Thu, 18 Nov 2021 19:23:18 +0100 Subject: [PATCH 16/38] removed qt deprecation warning --- wrap/qt/shot_qt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wrap/qt/shot_qt.h b/wrap/qt/shot_qt.h index e9ca0a6c..8c246d16 100644 --- a/wrap/qt/shot_qt.h +++ b/wrap/qt/shot_qt.h @@ -51,7 +51,7 @@ template shot.Extrinsics.SetTra(-tra); vcg::Matrix44 rot; - QStringList values = attr.namedItem("RotationMatrix").nodeValue().split(" ", QString::SkipEmptyParts); + QStringList values = attr.namedItem("RotationMatrix").nodeValue().split(" ", Qt::SkipEmptyParts); for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++) rot[y][x] = values[x + 4 * y].toDouble(); From 0c4e210bba9990d5ef0fec86ecd04ed344899f2b Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Fri, 19 Nov 2021 16:03:05 +0100 Subject: [PATCH 17/38] added PLY import support for vertex texture coordinates as double --- wrap/io_trimesh/import_ply.h | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/wrap/io_trimesh/import_ply.h b/wrap/io_trimesh/import_ply.h index ab6bda9c..2da70ab4 100644 --- a/wrap/io_trimesh/import_ply.h +++ b/wrap/io_trimesh/import_ply.h @@ -63,6 +63,7 @@ public: typedef typename VertexType::QualityType VertQualityType; typedef typename OpenMeshType::FaceType FaceType; typedef typename FaceType::QualityType FaceQualityType; + typedef typename VertexType::TexCoordType::ScalarType TexScalarType; typedef typename OpenMeshType::VertexIterator VertexIterator; typedef typename OpenMeshType::FaceIterator FaceIterator; @@ -159,7 +160,7 @@ public: float k4; }; -#define 
_VERTDESC_LAST_ 32 +#define _VERTDESC_LAST_ 34 static const PropDescriptor &VertDesc(int i) { static const PropDescriptor pv[_VERTDESC_LAST_]={ @@ -189,14 +190,16 @@ public: /*22*/ {"vertex", "s", ply::T_FLOAT, ply::T_FLOAT, offsetof(LoadPly_VertAux,u),0,0,0,0,0 ,0}, /*23*/ {"vertex", "t", ply::T_FLOAT, ply::T_FLOAT, offsetof(LoadPly_VertAux,v),0,0,0,0,0 ,0}, // DOUBLE - /*24*/ {"vertex", "x", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,p),0,0,0,0,0 ,0}, - /*25*/ {"vertex", "y", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,p) + sizeof(ScalarType) ,0,0,0,0,0 ,0}, - /*26*/ {"vertex", "z", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,p) + 2*sizeof(ScalarType),0,0,0,0,0 ,0}, - /*27*/ {"vertex", "nx", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,n) ,0,0,0,0,0 ,0}, - /*28*/ {"vertex", "ny", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,n) + 1*sizeof(ScalarType),0,0,0,0,0 ,0}, - /*29*/ {"vertex", "nz", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,n) + 2*sizeof(ScalarType),0,0,0,0,0 ,0}, - /*30*/ {"vertex", "radius", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,radius),0,0,0,0,0 ,0}, - /*31*/ {"vertex", "quality", ply::T_DOUBLE, PlyType(),offsetof(LoadPly_VertAux,q),0,0,0,0,0 ,0} + /*24*/ {"vertex", "x", ply::T_DOUBLE, PlyType(), offsetof(LoadPly_VertAux,p),0,0,0,0,0 ,0}, + /*25*/ {"vertex", "y", ply::T_DOUBLE, PlyType(), offsetof(LoadPly_VertAux,p) + sizeof(ScalarType) ,0,0,0,0,0 ,0}, + /*26*/ {"vertex", "z", ply::T_DOUBLE, PlyType(), offsetof(LoadPly_VertAux,p) + 2*sizeof(ScalarType),0,0,0,0,0 ,0}, + /*27*/ {"vertex", "nx", ply::T_DOUBLE, PlyType(), offsetof(LoadPly_VertAux,n) ,0,0,0,0,0 ,0}, + /*28*/ {"vertex", "ny", ply::T_DOUBLE, PlyType(), offsetof(LoadPly_VertAux,n) + 1*sizeof(ScalarType),0,0,0,0,0 ,0}, + /*29*/ {"vertex", "nz", ply::T_DOUBLE, PlyType(), offsetof(LoadPly_VertAux,n) + 2*sizeof(ScalarType),0,0,0,0,0 ,0}, + /*30*/ {"vertex", "radius", ply::T_DOUBLE, PlyType(), offsetof(LoadPly_VertAux,radius),0,0,0,0,0 ,0}, + /*31*/ 
{"vertex", "quality", ply::T_DOUBLE, PlyType(), offsetof(LoadPly_VertAux,q),0,0,0,0,0 ,0}, + /*32*/ {"vertex", "texture_u", ply::T_DOUBLE, PlyType(), offsetof(LoadPly_VertAux,u),0,0,0,0,0 ,0}, + /*33*/ {"vertex", "texture_v", ply::T_DOUBLE, PlyType(), offsetof(LoadPly_VertAux,v),0,0,0,0,0 ,0}, }; return pv[i]; } @@ -514,6 +517,10 @@ public: { pi.mask |= Mask::IOM_VERTTEXCOORD; } + if(( pf.AddToRead(VertDesc(32))!=-1 )&& (pf.AddToRead(VertDesc(33))!=-1)) + { + pi.mask |= Mask::IOM_VERTTEXCOORD; + } } if(tri::HasPerVertexRadius(m)) { @@ -1198,6 +1205,9 @@ public: if( pf.AddToRead(VertDesc(18))!=-1 && pf.AddToRead(VertDesc(19))!=-1) mask |= Mask::IOM_VERTTEXCOORD; + if( pf.AddToRead(VertDesc(32))!=-1 && + pf.AddToRead(VertDesc(33))!=-1) mask |= Mask::IOM_VERTTEXCOORD; + if( pf.AddToRead(FaceDesc(0))!=-1 ) mask |= Mask::IOM_FACEINDEX; if( pf.AddToRead(FaceDesc(1))!=-1 ) mask |= Mask::IOM_FACEFLAGS; if( pf.AddToRead(FaceDesc(10))!=-1 && From f2ba3e973eb3219a3c918221fc05ff243e539ca1 Mon Sep 17 00:00:00 2001 From: korialis Date: Fri, 26 Nov 2021 16:25:17 +0100 Subject: [PATCH 18/38] cleaning of isotropic remeshing and small fix to adaptivity --- vcg/complex/algorithms/isotropic_remeshing.h | 436 ++++++++----------- 1 file changed, 185 insertions(+), 251 deletions(-) diff --git a/vcg/complex/algorithms/isotropic_remeshing.h b/vcg/complex/algorithms/isotropic_remeshing.h index 5359a3b1..9f2ab451 100644 --- a/vcg/complex/algorithms/isotropic_remeshing.h +++ b/vcg/complex/algorithms/isotropic_remeshing.h @@ -149,13 +149,13 @@ private: static void removeColinearFaces(MeshType & m, Params & params) { - vcg::tri::UpdateTopology::FaceFace(m); + vcg::tri::UpdateTopology::FaceFace(m); int count = 0; int iter = 0; do { - vcg::tri::UpdateTopology::FaceFace(m); + // vcg::tri::UpdateTopology::FaceFace(m); vcg::tri::UnMarkAll(m); count = 0; @@ -163,44 +163,49 @@ private: { FaceType & f = m.face[i]; - ScalarType quality = vcg::QualityRadii(f.cP(0), f.cP(1), f.cP(2)); + const ScalarType 
quality = vcg::QualityRadii(f.cP(0), f.cP(1), f.cP(2)); if (quality <= 0.001) { //find longest edge - double edges[3]; - edges[0] = vcg::Distance(f.cP(0), f.cP(1)); - edges[1] = vcg::Distance(f.cP(1), f.cP(2)); - edges[2] = vcg::Distance(f.cP(2), f.cP(0)); + const double edges[3] = { + vcg::Distance(f.cP(0), f.cP(1)), + vcg::Distance(f.cP(1), f.cP(2)), + vcg::Distance(f.cP(2), f.cP(0)) + }; - ScalarType smallestEdge = std::min(edges[0], std::min(edges[1], edges[2])); - int longestIdx = std::find(edges, edges+3, std::max(std::max(edges[0], edges[1]), edges[2])) - (edges); + const double smallestEdge = std::min(edges[0], std::min(edges[1], edges[2])); + const int longestIdx = int(std::find(edges, edges+3, std::max(std::max(edges[0], edges[1]), edges[2])) - (edges)); if (vcg::tri::IsMarked(m, f.V2(longestIdx))) continue; auto f1 = f.cFFp(longestIdx); - vcg::tri::Mark(m,f.V2(longestIdx)); + vcg::tri::Mark(m, f.V2(longestIdx)); if (!vcg::face::IsBorder(f, longestIdx) && vcg::face::IsManifold(f, longestIdx) && vcg::face::checkFlipEdgeNotManifold(f, longestIdx)) { // Check if EdgeFlipping improves quality - FacePointer g = f.FFp(longestIdx); int k = f.FFi(longestIdx); - vcg::Triangle3 t1(f.P(longestIdx), f.P1(longestIdx), f.P2(longestIdx)), t2(g->P(k), g->P1(k), g->P2(k)), - t3(f.P(longestIdx), g->P2(k), f.P2(longestIdx)), t4(g->P(k), f.P2(longestIdx), g->P2(k)); + const FacePointer g = f.FFp(longestIdx); + const int k = f.FFi(longestIdx); - auto n1 = vcg::TriangleNormal(t1); - auto n2 = vcg::TriangleNormal(t2); - auto n3 = vcg::TriangleNormal(t3); - auto n4 = vcg::TriangleNormal(t4); + const vcg::Triangle3 t1(f.P(longestIdx), f.P1(longestIdx), f.P2(longestIdx)); + const vcg::Triangle3 t2(g->P(k), g->P1(k), g->P2(k)); + const vcg::Triangle3 t3(f.P(longestIdx), g->P2(k), f.P2(longestIdx)); + const vcg::Triangle3 t4(g->P(k), f.P2(longestIdx), g->P2(k)); - auto biggestSmallest = vcg::DoubleArea(t1) > vcg::DoubleArea(t2) ? 
std::make_pair(t1, t2) : std::make_pair(t2, t1); - auto areaRatio = vcg::DoubleArea(biggestSmallest.first) / vcg::DoubleArea(biggestSmallest.second); + const auto n1 = vcg::TriangleNormal(t1); + const auto n2 = vcg::TriangleNormal(t2); + const auto n3 = vcg::TriangleNormal(t3); + const auto n4 = vcg::TriangleNormal(t4); + + const auto biggestSmallest = vcg::DoubleArea(t1) > vcg::DoubleArea(t2) ? std::make_pair(t1, t2) : std::make_pair(t2, t1); + const auto areaRatio = vcg::DoubleArea(biggestSmallest.first) / vcg::DoubleArea(biggestSmallest.second); bool normalCheck = true; // if (n1.Norm() > 0.001 && n2.Norm() > 0.001) { - auto referenceNormal = vcg::NormalizedTriangleNormal(biggestSmallest.first); + const auto referenceNormal = vcg::NormalizedTriangleNormal(biggestSmallest.first); normalCheck &= vcg::NormalizedTriangleNormal(t3) * referenceNormal >= 0.95; normalCheck &= vcg::NormalizedTriangleNormal(t4) * referenceNormal >= 0.95; @@ -240,9 +245,9 @@ private: vcg::tri::Clean::RemoveUnreferencedVertex(m); vcg::tri::Allocator::CompactEveryVector(m); - vcg::tri::UpdateTopology::FaceFace(m); + // vcg::tri::UpdateTopology::FaceFace(m); removeColinearFaces(m, params); - vcg::tri::UpdateTopology::FaceFace(m); + // vcg::tri::UpdateTopology::FaceFace(m); } public: @@ -262,7 +267,6 @@ public: assert(&toRemesh != &toProject); params.stat.Reset(); - tri::UpdateBounding::Box(toRemesh); { @@ -287,15 +291,15 @@ public: { if(cb) cb(100*i/params.iter, "Remeshing"); - if (params.adapt) { computeQualityDistFromRadii(toRemesh); - tri::Smooth::VertexQualityLaplacian(toRemesh, 2); + vcg::tri::Smooth::VertexQualityLaplacian(toRemesh, 2); } if(params.splitFlag) SplitLongEdges(toRemesh, params); + #ifdef DEBUG_CREASE debug_crease(toRemesh, std::string("after_ref"), i); #endif @@ -334,47 +338,46 @@ public: if (p.IsBorder()) p.F()->SetFaceEdgeS(p.E()); - // if((p.F1Flip() > p.F())) + const FaceType *ff = p.F(); + const FaceType *ffAdj = p.FFlip(); + + const double quality = 
vcg::QualityRadii(ff->cP(0), ff->cP(1), ff->cP(2)); + const double qualityAdj = vcg::QualityRadii(ffAdj->cP(0), ffAdj->cP(1), ffAdj->cP(2)); + + const bool qualityCheck = quality > 0.00000001 && qualityAdj > 0.00000001; + // bool areaCheck = vcg::DoubleArea(*ff) > 0.000001 && vcg::DoubleArea(*ffAdj) > 0.000001; + + if ((forceTag || !params.userSelectedCreases) && (testCreaseEdge(p, params.creaseAngleCosThr) /*&& areaCheck*/ /* && qualityCheck*/) || p.IsBorder()) { - FaceType *ff = p.F(); - FaceType *ffAdj = p.FFlip(); + PosType pp = p; + std::vector faces; + std::vector edges; + bool allOk = true; - double quality = vcg::QualityRadii(ff->cP(0), ff->cP(1), ff->cP(2)); - double qualityAdj = vcg::QualityRadii(ffAdj->cP(0), ffAdj->cP(1), ffAdj->cP(2)); - - bool qualityCheck = quality > 0.00000001 && qualityAdj > 0.00000001; - // bool areaCheck = vcg::DoubleArea(*ff) > 0.000001 && vcg::DoubleArea(*ffAdj) > 0.000001; - - if ((forceTag || !params.userSelectedCreases) && (testCreaseEdge(p, params.creaseAngleCosThr) /*&& areaCheck*//* && qualityCheck*/) || p.IsBorder()) + do { - PosType pp = p; - std::vector faces; - std::vector edges; - bool allOk = true; - - do { - faces.push_back(pp.F()); - edges.push_back(pp.E()); - // pp.F()->SetFaceEdgeS(pp.E()); - if (vcg::QualityRadii(pp.F()->cP(0), pp.F()->cP(1), pp.F()->cP(2)) <= 0.0001) - { - allOk = false; - break; - } - pp.NextF(); - } while (pp != p); - - if (allOk) + faces.push_back(pp.F()); + edges.push_back(pp.E()); + // pp.F()->SetFaceEdgeS(pp.E()); + if (vcg::QualityRadii(pp.F()->cP(0), pp.F()->cP(1), pp.F()->cP(2)) <= 0.0001) { - for (int i = 0; i < faces.size(); ++i) - { - faces[i]->SetFaceEdgeS(edges[i]); - } + allOk = false; + break; } + pp.NextF(); + } while (pp != p); - creaseQueue.push(p); + if (allOk) + { + for (int i = 0; i < faces.size(); ++i) + { + faces[i]->SetFaceEdgeS(edges[i]); + } } + + creaseQueue.push(p); } + }); return count; } @@ -392,14 +395,14 @@ private: */ IsotropicRemeshing() {} // this returns 
the value of cos(a) where a is the angle between n0 and n1. (scalar prod is cos(a)) - static inline ScalarType fastAngle(Point3 n0, Point3 n1) + static inline ScalarType fastAngle(const Point3 & n0, const Point3 & n1) { return math::Clamp(n0*n1,(ScalarType)-1.0,(ScalarType)1.0); } // compare the value of the scalar prod with the cos of the crease threshold - static inline bool testCreaseEdge(PosType &p, ScalarType creaseCosineThr) + static inline bool testCreaseEdge(PosType &p, const ScalarType creaseCosineThr) { - ScalarType angle = fastAngle(NormalizedTriangleNormal(*(p.F())), NormalizedTriangleNormal(*(p.FFlip()))); + const ScalarType angle = fastAngle(NormalizedTriangleNormal(*(p.F())), NormalizedTriangleNormal(*(p.FFlip()))); return angle <= creaseCosineThr && angle >= -0.98; // return (angle <= creaseCosineThr && angle >= -creaseCosineThr); } @@ -432,16 +435,15 @@ private: std::vector ff; face::VFExtendedStarVF(&*vi, 2, ff); - ScalarType tot = 0.f; - auto it = ff.begin(); - Point3 fNormal = NormalizedTriangleNormal(**it); - ++it; - while(it != ff.end()) - { - tot+= 1-math::Abs(fastAngle(fNormal, NormalizedTriangleNormal(**it))); - ++it; - } - vi->Q() = tot / (ScalarType)(std::max(1, ((int)ff.size()-1))); + assert(ff.size() > 0); + + const Point3 & fNormal = NormalizedTriangleNormal(**it); + + const auto tot = std::accumulate(++ff.begin(), ff.end(), 0.d, [&](const Scalartype acc, const FaceType * f) { + return acc + (1 - math::Abs(fastAngle(n, NormalizedTriangleNormal(*f)))); + }); + + vi->Q() = tot / (std::max(1, ((int)ff.size()-1))); vi->SetV(); } tri::Smooth::VertexQualityLaplacian(m, 3); @@ -452,8 +454,10 @@ private: tri::RequirePerVertexQuality(m); tri::UpdateTopology::FaceFace(m); // tri::UpdateFlags::VertexClearV(m); - for (size_t i=0;i seeds; ForEachFace(m, [&] (FaceType & f) { @@ -485,8 +489,8 @@ private: tri::RequirePerVertexQuality(m); tri::RequirePerFaceQuality(m); - ScalarType maxV = 0; - ScalarType minV = 10; + ScalarType maxV = 
std::numeric_limits::lowest(); + ScalarType minV = std::numeric_limits::max(); ForEachFace(m, [&] (FaceType & f) { f.Q() = 1. - vcg::QualityRadii(f.cP(0), f.cP(1), f.cP(2)); @@ -494,29 +498,22 @@ private: minV = std::min(minV, f.Q()); }); - //normalize - ForEachFace(m, [&] (FaceType & f) { - f.Q() = std::pow((f.Q() - minV) / (maxV - minV), 2.); + vcg::tri::UpdateQuality::VertexFromFace(m); + + maxV = std::numeric_limits::lowest(); + minV = std::numeric_limits::max(); + + //normalize in [0,1] with square reshape + ForEachVertex(m, [&] (const VertexType & v) { + maxV = std::max(maxV, v.Q()); + minV = std::min(minV, v.Q()); }); - std::vector vertMax(m.VN(), 0); - std::vector vertMin(m.VN(), 10); + const ScalarType vRange = maxV - minV + 0.000001; - ForEachFace(m, [&] (FaceType & f) { - for (int i = 0; i < 3; ++i) - { - auto vidx = vcg::tri::Index(m, f.V(i)); - vertMax[vidx] = std::max(vertMax[vidx], f.Q()); - vertMin[vidx] = std::min(vertMin[vidx], f.Q()); - } + ForEachVertex(m, [&] (VertexType & v) { + v.Q() = std::pow((v.Q() - minV) / vRange, 2.); }); - - for (size_t v = 0; v < m.VN(); ++v) - { - m.vert[v].Q() = vertMax[v] - vertMin[v]; - } - -// tri::UpdateQuality::VertexFromFace(m); } static void computeQualityDistFromHeight(MeshType & m, const ScalarType lowerBound, const ScalarType higherBound) @@ -524,10 +521,6 @@ private: tri::RequirePerVertexQuality(m); tri::RequirePerFaceQuality(m); - ScalarType maxV = 0; - ScalarType minV = 10; - - ForEachFace(m, [&] (FaceType & f) { ScalarType minH = std::numeric_limits::max(); for (int i = 0; i < 3; ++i) @@ -549,36 +542,35 @@ private: 4 for border vertices 6 for internal vertices */ - static inline int idealValence(PosType &p) + static inline int idealValence(const PosType &p) { if(p.IsBorder()) return 4; return 6; } - static inline int idealValence(VertexType &v) + static inline int idealValence(const VertexType &v) { if(v.IsB()) return 4; return 6; } - static inline int idealValenceSlow(PosType &p) + static inline int 
idealValenceSlow(const PosType &p) { std::vector posVec; VFOrderedStarFF(p,posVec); - float angleSumRad =0; - for(PosType &ip : posVec) - { - angleSumRad += ip.AngleRad(); - } + + const auto angleSumRad = std::accumulate(posVec.begin, posVec.end(), 0, [](const ScalarType acc, const PosType & p) { + return acc + p.AngleRad(); + }); return (int)(std::ceil(angleSumRad / (M_PI/3.0f))); } - static bool testHausdorff (MeshType & m, StaticGrid & grid, const std::vector & verts, const ScalarType maxD, const CoordType checkOrientation = CoordType(0,0,0)) + static bool testHausdorff (MeshType & m, StaticGrid & grid, const std::vector & verts, const ScalarType maxD, const CoordType & checkOrientation = CoordType(0,0,0)) { for (CoordType v : verts) { CoordType closest, normal, ip; ScalarType dist = 0; - FaceType* fp = GetClosestFaceBase(m, grid, v, maxD, dist, closest); + const FaceType* fp = GetClosestFaceBase(m, grid, v, maxD, dist, closest); //you can't use this kind of orientation check, since when you stand on edges it fails if (fp == NULL || (checkOrientation != CoordType(0,0,0) && checkOrientation * fp->N() < 0.7)) @@ -612,52 +604,54 @@ private: v3 v3 Before Swap After Swap */ - static bool testSwap(PosType p, ScalarType creaseAngleCosThr) + + //TODO: check if you can optimize using posType valence counting functions + static bool testSwap(const PosType & p, const ScalarType creaseAngleCosThr) { //if border or feature, do not swap if (/*p.IsBorder() || */p.IsEdgeS()) return false; - int oldDist = 0, newDist = 0, idealV, actualV; + int oldDist = 0, newDist = 0, idealV = 0, actualV = 0; PosType tp=p; - VertexType *v0=tp.V(); + const VertexType *v0=tp.V(); std::vector incident; - vcg::face::VVStarVF(tp.V(), incident); - idealV = idealValence(tp); actualV = incident.size(); + // vcg::face::VVStarVF(tp.V(), incident); + idealV = idealValence(tp); actualV = tp.NumberOfIncidentVertices();//int(incident.size()); oldDist += abs(idealV - actualV); newDist += abs(idealV - (actualV 
- 1)); tp.NextF();tp.FlipE();tp.FlipV(); - VertexType *v1=tp.V(); - vcg::face::VVStarVF(tp.V(), incident); - idealV = idealValence(tp); actualV = incident.size(); + const VertexType *v1=tp.V(); + // vcg::face::VVStarVF(tp.V(), incident); + idealV = idealValence(tp); actualV = tp.NumberOfIncidentVertices();//int(incident.size()); oldDist += abs(idealV - actualV); newDist += abs(idealV - (actualV + 1)); tp.FlipE();tp.FlipV();tp.FlipE(); - VertexType *v2=tp.V(); - vcg::face::VVStarVF(tp.V(), incident); - idealV = idealValence(tp); actualV = incident.size(); + const VertexType *v2=tp.V(); + // vcg::face::VVStarVF(tp.V(), incident); + idealV = idealValence(tp); actualV = tp.NumberOfIncidentVertices();//int(incident.size()); oldDist += abs(idealV - actualV); newDist += abs(idealV - (actualV - 1)); tp.NextF();tp.FlipE();tp.FlipV(); - VertexType *v3=tp.V(); - vcg::face::VVStarVF(tp.V(), incident); - idealV = idealValence(tp); actualV = incident.size(); + const VertexType *v3=tp.V(); + // vcg::face::VVStarVF(tp.V(), incident); + idealV = idealValence(tp); actualV = tp.NumberOfIncidentVertices();//int(incident.size()); oldDist += abs(idealV - actualV); newDist += abs(idealV - (actualV + 1)); - ScalarType qOld = std::min(Quality(v0->P(),v2->P(),v3->P()),Quality(v0->P(),v1->P(),v2->P())); - ScalarType qNew = std::min(Quality(v0->P(),v1->P(),v3->P()),Quality(v2->P(),v3->P(),v1->P())); + const ScalarType qOld = std::min(Quality(v0->P(),v2->P(),v3->P()),Quality(v0->P(),v1->P(),v2->P())); + const ScalarType qNew = std::min(Quality(v0->P(),v1->P(),v3->P()),Quality(v2->P(),v3->P(),v1->P())); return (newDist < oldDist && qNew >= qOld * 0.50f) || (newDist == oldDist && qNew > qOld * 1.f) || qNew > 1.5f * qOld; } - static bool checkManifoldness(FaceType & f, int z) + static bool checkManifoldness(const FaceType & f, const int z) { PosType pos(&f, (z+2)%3, f.V2(z)); - PosType start = pos; + const PosType start = pos; do { pos.FlipE(); @@ -672,7 +666,7 @@ private: // Edge swap step: 
edges are flipped in order to optimize valence and triangle quality across the mesh static void ImproveValence(MeshType &m, Params ¶ms) { - static ScalarType foldCheckRad = math::ToRad(5.); + const static ScalarType foldCheckRad = math::ToRad(5.); tri::UpdateTopology::FaceFace(m); tri::UpdateTopology::VertexFace(m); ForEachFace(m, [&] (FaceType & f) { @@ -681,10 +675,8 @@ private: { if (&f > f.cFFp(i)) { - PosType pi(&f, i); - CoordType swapEdgeMidPoint = (f.cP2(i) + f.cFFp(i)->cP2(f.cFFi(i))) / 2.; - std::vector toCheck(1, swapEdgeMidPoint); - + const PosType pi(&f, i); + const CoordType swapEdgeMidPoint = (f.cP2(i) + f.cFFp(i)->cP2(f.cFFi(i))) / 2.; if(((!params.selectedOnly) || (f.IsS() && f.cFFp(i)->IsS())) && !face::IsBorder(f, i) && @@ -692,15 +684,15 @@ private: face::checkFlipEdgeNotManifold(f, i) && testSwap(pi, params.creaseAngleCosThr) && // face::CheckFlipEdge(f, i) && - face::CheckFlipEdgeNormal(f, i, params.creaseAngleRadThr) && //vcg::math::ToRad(5.)) && - (!params.surfDistCheck || testHausdorff(*params.mProject, params.grid, toCheck, params.maxSurfDist))) + face::CheckFlipEdgeNormal(f, i, float(vcg::math::ToRad(5.))) && + (!params.surfDistCheck || testHausdorff(*params.mProject, params.grid, { swapEdgeMidPoint }, params.maxSurfDist))) { //When doing the swap we need to preserve and update the crease info accordingly FaceType* g = f.cFFp(i); - int w = f.FFi(i); + const int w = f.FFi(i); - bool creaseF = g->IsFaceEdgeS((w + 1) % 3); - bool creaseG = f.IsFaceEdgeS((i + 1) % 3); + const bool creaseF = g->IsFaceEdgeS((w + 1) % 3); + const bool creaseG = f.IsFaceEdgeS((i + 1) % 3); face::FlipEdgeNotManifold(f, i); @@ -732,9 +724,9 @@ private: bool operator()(PosType &ep) { - ScalarType quality = (((math::Abs(ep.V()->Q())+math::Abs(ep.VFlip()->Q()))/(ScalarType)2.0)-minQ)/(maxQ-minQ); - ScalarType mult = computeLengthThrMult(params, quality); - ScalarType dist = Distance(ep.V()->P(), ep.VFlip()->P()); + const ScalarType quality = ((ep.V()->Q()+ 
ep.VFlip()->Q())/(ScalarType)2.0); + const ScalarType mult = computeLengthThrMult(params, quality); + const ScalarType dist = Distance(ep.V()->P(), ep.VFlip()->P()); if(dist > mult * length) { ++count; @@ -769,7 +761,7 @@ private: tri::UpdateTopology::FaceFace(m); tri::MidPoint midFunctor(&m); - ScalarType minQ,maxQ; + ScalarType minQ = 0, maxQ = 0; if(params.adapt){ computeVQualityDistrMinMax(m, minQ, maxQ); EdgeSplitAdaptPred ep(params); @@ -790,20 +782,20 @@ private: static int VtoE(const int v0, const int v1) { - static /*constexpr*/ int Vmat[3][3] = { -1, 0, 2, - 0, -1, 1, - 2, 1, -1}; + static constexpr int Vmat[3][3] = { -1, 0, 2, + 0, -1, 1, + 2, 1, -1}; return Vmat[v0][v1]; } - static bool checkCanMoveOnCollapse(PosType p, std::vector & faces, std::vector & vIdxes, Params ¶ms) + static bool checkCanMoveOnCollapse(const PosType & p, const std::vector & faces, const std::vector & vIdxes, const Params ¶ms) { bool allIncidentFaceSelected = true; PosType pi = p; - CoordType dEdgeVector = (p.V()->cP() - p.VFlip()->cP()).Normalize(); + const CoordType dEdgeVector = (p.V()->cP() - p.VFlip()->cP()).Normalize(); int incidentFeatures = 0; @@ -815,7 +807,7 @@ private: { vcg::tri::Mark(*params.m,faces[i]->V1(vIdxes[i])); incidentFeatures++; - CoordType movingEdgeVector0 = (faces[i]->cP1(vIdxes[i]) - faces[i]->cP(vIdxes[i])).Normalize(); + const CoordType movingEdgeVector0 = (faces[i]->cP1(vIdxes[i]) - faces[i]->cP(vIdxes[i])).Normalize(); if (std::fabs(movingEdgeVector0 * dEdgeVector) < .9f || !p.IsEdgeS()) return false; } @@ -823,7 +815,7 @@ private: { vcg::tri::Mark(*params.m,faces[i]->V2(vIdxes[i])); incidentFeatures++; - CoordType movingEdgeVector1 = (faces[i]->cP2(vIdxes[i]) - faces[i]->cP(vIdxes[i])).Normalize(); + const CoordType movingEdgeVector1 = (faces[i]->cP2(vIdxes[i]) - faces[i]->cP(vIdxes[i])).Normalize(); if (std::fabs(movingEdgeVector1 * dEdgeVector) < .9f || !p.IsEdgeS()) return false; } @@ -836,61 +828,51 @@ private: return params.selectedOnly ? 
allIncidentFaceSelected : true; } - static bool checkFacesAfterCollapse (std::vector & faces, PosType p, const Point3 &mp, Params ¶ms, bool relaxed) + static bool checkFacesAfterCollapse (const std::vector & faces, const PosType & p, const Point3 &mp, Params ¶ms, bool relaxed) { for (FaceType* f : faces) { if(!(*f).IsD() && f != p.F()) //i'm not a deleted face { - PosType pi(f, p.V()); //same vertex + const PosType pi(f, p.V()); //same vertex - VertexType *v0 = pi.V(); - VertexType *v1 = pi.F()->V1(pi.VInd()); - VertexType *v2 = pi.F()->V2(pi.VInd()); + const auto v0 = pi.V(); + const auto v1 = pi.F()->V1(pi.VInd()); + const auto v2 = pi.F()->V2(pi.VInd()); if( v1 == p.VFlip() || v2 == p.VFlip()) //i'm the other deleted face continue; //check on new face quality { - ScalarType newQ = Quality(mp, v1->P(), v2->P()); - ScalarType oldQ = Quality(v0->P(), v1->P(), v2->P()); + const auto newQ = Quality(mp, v1->P(), v2->P()); + const auto oldQ = Quality(v0->P(), v1->P(), v2->P()); if(newQ <= 0.5*oldQ) return false; } // we prevent collapse that makes edges too long (except for cross) - if(!relaxed) - if((Distance(mp, v1->P()) > params.maxLength || Distance(mp, v2->P()) > params.maxLength)) - return false; - - Point3 oldN = NormalizedTriangleNormal(*(pi.F())); - Point3 newN = Normal(mp, v1->P(), v2->P()).Normalize(); - -// if (oldN * newN < 0.5f) -// return false; - - std::vector baryP(1); - baryP[0] = (v1->cP() + v2->cP() + mp) / 3.; - - if (!testHausdorff(*(params.mProject), params.grid, baryP, params.maxSurfDist, newN)) + if(!relaxed && (Distance(mp, v1->P()) > params.maxLength || Distance(mp, v2->P()) > params.maxLength)) return false; + const auto oldN = NormalizedTriangleNormal(*(pi.F())); + const auto newN = Normal(mp, v1->P(), v2->P()).Normalize(); + + if (oldN * newN < 0.7f) + return false; + //check on new face distance from original mesh if (params.surfDistCheck) { - std::vector points(3); - std::vector baryP(1); + const auto points = { + (v1->cP() + mp) / 2., + 
(v2->cP() + mp) / 2., + mp, + }; - baryP[0] = (v1->cP() + v2->cP() + mp) / 3.; - - points[0] = (v1->cP() + mp) / 2.; - points[1] = (v2->cP() + mp) / 2.; - points[2] = mp; - - if (!testHausdorff(*(params.mProject), params.grid, points, params.maxSurfDist))// || -// !testHausdorff(*(params.mProject), params.grid, baryP, params.maxSurfDist, newN)) + if (!testHausdorff(*(params.mProject), params.grid, points, params.maxSurfDist) || + !testHausdorff(*(params.mProject), params.grid, { (v1->cP() + v2->cP() + mp) / 3. }, params.maxSurfDist, newN)) return false; } } @@ -900,10 +882,9 @@ private: //TODO: Refactor code and implement the correct set up of crease info when collapsing towards a crease edge - static bool checkCollapseFacesAroundVert1(PosType &p, VertexPair & pair, Point3 &mp, Params ¶ms, bool relaxed) + static bool checkCollapseFacesAroundVert1(const PosType &p, VertexPair & pair, Point3 &mp, Params ¶ms, bool relaxed) { PosType p0 = p, p1 = p; - p1.FlipV(); std::vector vi0, vi1; @@ -913,8 +894,8 @@ private: face::VFStarVF(p1.V(), ff1, vi1); //check crease-moveability - bool moveable0 = checkCanMoveOnCollapse(p0, ff0, vi0, params) && !p0.V()->IsS(); - bool moveable1 = checkCanMoveOnCollapse(p1, ff1, vi1, params) && !p1.V()->IsS(); + const bool moveable0 = checkCanMoveOnCollapse(p0, ff0, vi0, params) && !p0.V()->IsS(); + const bool moveable1 = checkCanMoveOnCollapse(p1, ff1, vi1, params) && !p1.V()->IsS(); //if both moveable => go to midpoint // else collapse on movable one @@ -923,9 +904,6 @@ private: pair = moveable0 ? 
VertexPair(p0.V(), p1.V()) : VertexPair(p1.V(), p0.V()); - //casting int(true) is always 1 and int(false) = =0 - assert(int(true) == 1); - assert(int(false) == 0); mp = (p0.V()->cP() * int(moveable1) + p1.V()->cP() * int(moveable0)) / (int(moveable0) + int(moveable1)); if (checkFacesAfterCollapse(ff0, p0, mp, params, relaxed)) @@ -934,14 +912,15 @@ private: return false; } - static bool testCollapse1(PosType &p, VertexPair & pair, Point3 &mp, ScalarType minQ, ScalarType maxQ, Params ¶ms, bool relaxed = false) + static bool testCollapse1(const PosType &p, VertexPair & pair, Point3 &mp, ScalarType minQ, ScalarType maxQ, Params ¶ms, bool relaxed = false) { - ScalarType quality = (((math::Abs(p.V()->Q())+math::Abs(p.VFlip()->Q()))/(ScalarType)2.0)-minQ)/(maxQ-minQ); - ScalarType mult = computeLengthThrMult(params, quality); - ScalarType thr = mult*params.minLength; + const ScalarType quality = params.adapt ? ((p.V()->Q()+ p.VFlip()->Q())/(ScalarType)2.0) : 0; - ScalarType dist = Distance(p.V()->P(), p.VFlip()->P()); - ScalarType area = DoubleArea(*(p.F()))/2.f; + const ScalarType mult = computeLengthThrMult(params, quality); + const ScalarType thr = mult*params.minLength; + + const ScalarType dist = Distance(p.V()->P(), p.VFlip()->P()); + const ScalarType area = DoubleArea(*(p.F()))/2.f; if(relaxed || (dist < thr || area < params.minLength*params.minLength/100.f))//if to collapse { return checkCollapseFacesAroundVert1(p, pair, mp, params, relaxed); @@ -969,9 +948,9 @@ private: if(!(*v).IsD() && (*v).IsB() && v != p.V()) //ignore non border collapsedNV1 = ((*v).P() - p.V()->P()).normalized(); //edge vector after collapse - float cosine = cos(math::ToRad(1.5f)); - float angle0 = fabs(fastAngle(collapseNV, collapsedNV0)); - float angle1 = fabs(fastAngle(collapseNV, collapsedNV1)); + const float cosine = cos(math::ToRad(1.5f)); + const float angle0 = fabs(fastAngle(collapseNV, collapsedNV0)); + const float angle1 = fabs(fastAngle(collapseNV, collapsedNV1)); //if on both 
sides we deviate too much after collapse => don't collapse if(angle0 <= cosine && angle1 <= cosine) return false; @@ -984,7 +963,7 @@ private: // the linkConditions are preserved static void CollapseShortEdges(MeshType &m, Params ¶ms) { - ScalarType minQ, maxQ; + ScalarType minQ = 0, maxQ = 0; int candidates = 0; if(params.adapt) @@ -998,19 +977,18 @@ private: ss.push(); { - tri::UpdateTopology::FaceFace(m); + // tri::UpdateTopology::FaceFace(m); Clean::CountNonManifoldVertexFF(m,true); //FROM NOW ON VSelection is NotManifold - - for(auto fi=m.face.begin(); fi!=m.face.end(); ++fi) - if(!(*fi).IsD() && (params.selectedOnly == false || fi->IsS())) + ForEachFace(m, [&](FaceType &f) { + if(!f.IsD() && (params.selectedOnly == false || f.IsS())) { for(auto i=0; i<3; ++i) { - PosType pi(&*fi, i); + PosType pi(&f, i); ++candidates; - VertexPair bp = VertexPair(pi.V(), pi.VFlip()); + VertexPair bp = VertexPair(pi.V(), pi.VFlip()); Point3 mp = (pi.V()->P()+pi.VFlip()->P())/2.f; if(testCollapse1(pi, bp, mp, minQ, maxQ, params) && Collapser::LinkConditions(bp)) @@ -1022,6 +1000,7 @@ private: } } + }); } ss.pop(); } @@ -1103,52 +1082,8 @@ private: // v2 = (fv1 == v1) ? 
fv2 : fv1; } } - - face::VVStarVF(v0, vv0); - face::VVStarVF(v1, vv1); - face::VVStarVF(v2, vv2); - face::VVStarVF(v3, vv3); - - int nv0 = vv0.size(), nv1 = vv1.size(); - int nv2 = vv2.size(), nv3 = vv3.size(); - - int delta1 = (idealValence(*v0) - nv0) + (idealValence(*v2) - nv2); - int delta2 = (idealValence(*v1) - nv1) + (idealValence(*v3) - nv3); - - ScalarType Q1 = std::min(Quality(v0->P(), v1->P(), v3->P()), Quality(v1->P(), v2->P(), v3->P())); - ScalarType Q2 = std::min(Quality(v0->P(), v1->P(), v2->P()), Quality(v2->P(), v3->P(), v0->P())); - - if (crease[0] || crease[1] || crease[2] || crease[3]) - return false; - // if (crease[0] && crease[1] && crease[2] && crease[3]) - // { - // return false; - // } - - // if (crease[0] || crease[2]) - // { - // bp = VertexPair(p.V(), v0); - // return true; - // } - - // if (crease[1] || crease[3]) - // { - // bp = VertexPair(p.V(), v1); - // return true; - // } - - //no crease - if(delta1 < delta2 && Q1 >= 0.6f*Q2) - { - bp = VertexPair(p.V(), v1); - return true; - } - else - { - bp = VertexPair(p.V(), v0); - return true; - } } + //Cross Collapse pass: This pass cleans the mesh from cross vertices, keeping in mind the link conditions //and feature preservations tests. 
static void CollapseCrosses(MeshType &m , Params ¶ms) @@ -1160,19 +1095,17 @@ private: SelectionStack ss(m); ss.push(); - { tri::UpdateTopology::FaceFace(m); Clean::CountNonManifoldVertexFF(m,true); //From now on Selection on vertices is not manifoldness - - for(auto fi=m.face.begin(); fi!=m.face.end(); ++fi) - if(!(*fi).IsD() && (!params.selectedOnly || fi->IsS())) + ForEachFace(m, [&](FaceType &f) { + if(!f.IsD() && (params.selectedOnly == false || f.IsS())) { for(auto i=0; i<3; ++i) { - PosType pi(&*fi, i); + PosType pi(&f, i); if(!pi.V()->IsB()) { std::vector ff; @@ -1198,6 +1131,7 @@ private: } } } + }); } ss.pop(); From d55c7aa215ee08fe2603ae01b69b2fa774b7f63b Mon Sep 17 00:00:00 2001 From: korialis Date: Fri, 26 Nov 2021 16:34:23 +0100 Subject: [PATCH 19/38] fix typo in unused function --- vcg/complex/algorithms/isotropic_remeshing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcg/complex/algorithms/isotropic_remeshing.h b/vcg/complex/algorithms/isotropic_remeshing.h index 9f2ab451..9985e4ed 100644 --- a/vcg/complex/algorithms/isotropic_remeshing.h +++ b/vcg/complex/algorithms/isotropic_remeshing.h @@ -439,7 +439,7 @@ private: const Point3 & fNormal = NormalizedTriangleNormal(**it); - const auto tot = std::accumulate(++ff.begin(), ff.end(), 0.d, [&](const Scalartype acc, const FaceType * f) { + const auto tot = std::accumulate(++ff.begin(), ff.end(), 0.d, [&](const ScalarType acc, const FaceType * f) { return acc + (1 - math::Abs(fastAngle(n, NormalizedTriangleNormal(*f)))); }); From 44937573d08c1929ead9cf50a7827d54b7d74ed3 Mon Sep 17 00:00:00 2001 From: korialis Date: Fri, 26 Nov 2021 17:51:50 +0100 Subject: [PATCH 20/38] fix computeQuality --- vcg/complex/algorithms/isotropic_remeshing.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vcg/complex/algorithms/isotropic_remeshing.h b/vcg/complex/algorithms/isotropic_remeshing.h index 9985e4ed..6d79897a 100644 --- a/vcg/complex/algorithms/isotropic_remeshing.h +++ 
b/vcg/complex/algorithms/isotropic_remeshing.h @@ -437,10 +437,10 @@ private: assert(ff.size() > 0); - const Point3 & fNormal = NormalizedTriangleNormal(**it); + const Point3 fNormal = NormalizedTriangleNormal(**(ff.begin())); - const auto tot = std::accumulate(++ff.begin(), ff.end(), 0.d, [&](const ScalarType acc, const FaceType * f) { - return acc + (1 - math::Abs(fastAngle(n, NormalizedTriangleNormal(*f)))); + const auto tot = std::accumulate(++ff.begin(), ff.end(), 0., [&](const ScalarType acc, const FaceType * f) { + return acc + (1 - math::Abs(fastAngle(fNormal, NormalizedTriangleNormal(*f)))); }); vi->Q() = tot / (std::max(1, ((int)ff.size()-1))); From d66446f6dc802d9337eed2acec73369ea692557f Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Fri, 26 Nov 2021 18:39:37 +0100 Subject: [PATCH 21/38] fixed (part of) the obj importer when loading data that is not present in the destination mesh --- vcg/complex/algorithms/update/texture.h | 2 +- wrap/io_trimesh/import_obj.h | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/vcg/complex/algorithms/update/texture.h b/vcg/complex/algorithms/update/texture.h index d67bb999..3cae8ad1 100644 --- a/vcg/complex/algorithms/update/texture.h +++ b/vcg/complex/algorithms/update/texture.h @@ -34,7 +34,7 @@ namespace tri { /// \headerfile texture.h vcg/complex/algorithms/update/texture.h -/// \brief This class is used to update/generate texcoord position according to various critera. . +/// \brief This class is used to update/generate texcoord position according to various critera. 
template class UpdateTexture { diff --git a/wrap/io_trimesh/import_obj.h b/wrap/io_trimesh/import_obj.h index 5b8ab94d..974468be 100644 --- a/wrap/io_trimesh/import_obj.h +++ b/wrap/io_trimesh/import_obj.h @@ -446,7 +446,7 @@ public: } - if( oi.mask & vcg::tri::io::Mask::IOM_FACECOLOR) // assigning face color + if(((oi.mask & vcg::tri::io::Mask::IOM_FACECOLOR) != 0) && HasPerFaceColor(m)) // assigning face color ff.c = currentColor; ++numTriangles; @@ -569,7 +569,10 @@ public: } // assigning face color - if( oi.mask & vcg::tri::io::Mask::IOM_FACECOLOR) ff.c = currentColor; + if( ((oi.mask & vcg::tri::io::Mask::IOM_FACECOLOR) != 0) && HasPerFaceColor(m)) + { + ff.c = currentColor; + } ff.mInd = currentMaterialIdx; @@ -672,25 +675,26 @@ public: assert(vertInd >=0 && vertInd < m.vn); (void)vertInd; m.face[i].V(j) = &(m.vert[indexedFaces[i].v[j]]); - if (((oi.mask & vcg::tri::io::Mask::IOM_WEDGTEXCOORD) != 0) && (HasPerWedgeTexCoord(m))) + if (((oi.mask & vcg::tri::io::Mask::IOM_WEDGTEXCOORD) != 0) && HasPerWedgeTexCoord(m)) { ObjTexCoord t = texCoords[indexedFaces[i].t[j]]; m.face[i].WT(j).u() = t.u; m.face[i].WT(j).v() = t.v; m.face[i].WT(j).n() = indexedFaces[i].tInd; } - if ( oi.mask & vcg::tri::io::Mask::IOM_VERTTEXCOORD ) { + if (((oi.mask & vcg::tri::io::Mask::IOM_VERTTEXCOORD) != 0 ) && HasPerVertexTexCoord(m)) + { ObjTexCoord t = texCoords[indexedFaces[i].t[j]]; m.face[i].V(j)->T().u() = t.u; m.face[i].V(j)->T().v() = t.v; m.face[i].V(j)->T().n() = indexedFaces[i].tInd; } - if ( oi.mask & vcg::tri::io::Mask::IOM_WEDGNORMAL ) + if (((oi.mask & vcg::tri::io::Mask::IOM_WEDGNORMAL) != 0) && HasPerWedgeNormal(m)) { m.face[i].WN(j).Import(normals[indexedFaces[i].n[j]]); } - if ( oi.mask & vcg::tri::io::Mask::IOM_VERTNORMAL ) + if (((oi.mask & vcg::tri::io::Mask::IOM_VERTNORMAL) != 0) && HasPerVertexNormal(m)) { m.face[i].V(j)->N().Import(normals[indexedFaces[i].n[j]]); } From d406904ee609106bd4e2f5126c732c989ec9cb73 Mon Sep 17 00:00:00 2001 From: nico Date: Sat, 27 
Nov 2021 07:32:54 +1100 Subject: [PATCH 22/38] corrected SmoothIterative to use weights --- .../parametrization/tangent_field_operators.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/vcg/complex/algorithms/parametrization/tangent_field_operators.h b/vcg/complex/algorithms/parametrization/tangent_field_operators.h index 6e2d3087..58b706be 100644 --- a/vcg/complex/algorithms/parametrization/tangent_field_operators.h +++ b/vcg/complex/algorithms/parametrization/tangent_field_operators.h @@ -561,7 +561,8 @@ public: static void SmoothIterative(MeshType &mesh,int NDir=4, int NSteps=3, bool FixSelected=false, - bool UseOnlyUnSelected=false) + bool UseOnlyUnSelected=false, + ScalarType weightByQ=false) { typedef typename MeshType::FaceType FaceType; @@ -580,6 +581,7 @@ public: std::vector TangVect; std::vector Norms; FaceType *f0=&mesh.face[i]; + std::vector Weights; for (int j=0;jVN();j++) { FaceType *f1=f0->FFp(j); @@ -588,9 +590,20 @@ public: if (f0==f1)continue; TangVect.push_back(f1->PD1()); Norms.push_back(f1->N()); + if (weightByQ) + Weights.push_back(f1->Q()); + else + Weights.push_back(1); } + + //add its own value + if (weightByQ) + Weights.push_back(f0->Q()); + else + Weights.push_back(1); + assert(Norms.size()>0); - std::vector Weights; + Weights.resize(Norms.size(),1/(ScalarType)Norms.size()); NewPD1[i]=InterpolateCrossField(TangVect,Weights,Norms,f0->N(),NDir); } From 3b9e18cf4f7540c1d651dcf265ac570c56c56f07 Mon Sep 17 00:00:00 2001 From: nico Date: Sat, 27 Nov 2021 07:36:58 +1100 Subject: [PATCH 23/38] corrected one Invert call to Inverse in line 215 --- vcg/math/similarity.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vcg/math/similarity.h b/vcg/math/similarity.h index ddc8751c..6292253a 100644 --- a/vcg/math/similarity.h +++ b/vcg/math/similarity.h @@ -212,13 +212,13 @@ template void Similarity::FromMatri tra[2] = t.ElementAt(2, 3);t[2][3] = 0.0; rot.FromMatrix(t); - Invert(t); + 
t=Inverse(t); tra = t * tra; tra/= sca; } -template Similarity &Invert(Similarity &a) { +template Similarity &Invert(Similarity &a) { a.rot.Invert(); a.sca = 1/a.sca; a.tra = a.rot.Rotate(-a.tra)*a.sca; From c480831b99c2c829d94392e06357f5dafb3a6e2b Mon Sep 17 00:00:00 2001 From: nico Date: Sat, 27 Nov 2021 07:38:13 +1100 Subject: [PATCH 24/38] Corrected some bug including weighting schema of direction potentially wrong --- wrap/igl/smooth_field.h | 168 +++++++++++++++++++++++----------------- 1 file changed, 95 insertions(+), 73 deletions(-) diff --git a/wrap/igl/smooth_field.h b/wrap/igl/smooth_field.h index 9629a883..7a08df93 100644 --- a/wrap/igl/smooth_field.h +++ b/wrap/igl/smooth_field.h @@ -46,7 +46,7 @@ namespace vcg { namespace tri { -enum SmoothMethod{SMMiq,SMNPoly}; +enum SmoothMethod{SMMiq,SMNPoly,SMIterative}; template class FieldSmoother @@ -59,6 +59,7 @@ class FieldSmoother static void InitQualityByAnisotropyDir(MeshType &mesh) { + ScalarType minV=0.00001; std::vector QVal; for (size_t i=0;i1)NMax=1; - if (NMax<-1)NMax=-1; + if (NMax>1)NMax=1; + if (NMax<-1)NMax=-1; - if (NMin>1)NMin=1; - if (NMin<-1)NMin=-1; + if (NMin>1)NMin=1; + if (NMin<-1)NMin=-1; - ScalarType CurvAni=(NMax-NMin)/2; - mesh.vert[i].Q()=CurvAni; + ScalarType CurvAni=(NMax-NMin)/2; + CurvAni=std::max(CurvAni,minV); + mesh.vert[i].Q()=CurvAni; } vcg::tri::UpdateQuality::FaceFromVertex(mesh); } @@ -148,10 +150,10 @@ class FieldSmoother //hard constraints have selected face static void CollectHardConstraints( MeshType & mesh, - Eigen::VectorXi &HardI, - Eigen::MatrixXd &HardD, - SmoothMethod SMethod, - int Ndir) + Eigen::VectorXi &HardI, + Eigen::MatrixXd &HardD, + SmoothMethod SMethod, + int Ndir) { //count number of hard constraints int numS=vcg::tri::UpdateSelection::FaceCount(mesh); @@ -217,7 +219,6 @@ class FieldSmoother SoftD(curr_index,2)=dir.Z(); SoftW(curr_index,0)=mesh.face[i].Q(); - curr_index++; } @@ -333,6 +334,14 @@ class FieldSmoother Dir.Normalize(); } + + static void 
GloballyOrient(MeshType &mesh) + { + vcg::tri::CrossField::MakeDirectionFaceCoherent(mesh,true); + } + + + public: struct SmoothParam @@ -353,6 +362,8 @@ public: int curvRing; //this are additional hard constraints std::vector > AddConstr; + //the number of iteration in case of iterative method + size_t IteN; SmoothParam() { @@ -363,48 +374,11 @@ public: SmoothM=SMMiq; sharp_thr=0.0; curv_thr=0.4; + IteN=20; } }; - static void SelectConstraints(MeshType &mesh,SmoothParam &SParam) - { - //clear all selected faces - vcg::tri::UpdateFlags::FaceClear(mesh); - - //add curvature hard constraints - //ScalarType Ratio=mesh.bbox.Diag()*0.01; - - if (SParam.curv_thr>0) - AddCurvatureConstraints(mesh,SParam.curv_thr);///Ratio); - - //add alignment to sharp features - if (SParam.sharp_thr>0) - AddSharpEdgesConstraints(mesh,SParam.sharp_thr); - - //add border constraints - if (SParam.align_borders) - AddBorderConstraints(mesh); - - //aff final constraints - for (size_t i=0;i::MakeDirectionFaceCoherent(mesh,true); - } - - static void InitByCurvature(MeshType & mesh, unsigned Nring, bool UpdateFaces=true) @@ -434,12 +408,48 @@ public: InitQualityByAnisotropyDir(mesh); } - static void SmoothDirections(MeshType &mesh, - int Ndir, - SmoothMethod SMethod=SMNPoly, - bool HardAsS=true, - ScalarType alphaSoft=0) +private: + + static void SelectConstraints(MeshType &mesh,SmoothParam &SParam) { + //clear all selected faces + vcg::tri::UpdateFlags::FaceClear(mesh); + + //add curvature hard constraints + //ScalarType Ratio=mesh.bbox.Diag()*0.01; + + if (SParam.curv_thr>0) + AddCurvatureConstraints(mesh,SParam.curv_thr);///Ratio); + + //add alignment to sharp features + if (SParam.sharp_thr>0) + AddSharpEdgesConstraints(mesh,SParam.sharp_thr); + + //add border constraints + if (SParam.align_borders) + AddBorderConstraints(mesh); + + //aff final constraints + for (size_t i=0;i0)&&(SMethod==SMMiq)) - CollectSoftConstraints(mesh,SoftI,SoftD,SoftW); + 
CollectSoftConstraints(mesh,SoftI,SoftD,SoftW); //add some hard constraints if are not present int numC=3; @@ -489,9 +499,9 @@ public: } } - //finally smooth + //finally smooth if (SMethod==SMMiq) - SmoothMIQ(mesh,HardI,HardD,SoftI,SoftD,SoftW,alphaSoft,Ndir); + SmoothMIQ(mesh,HardI,HardD,SoftI,SoftD,SoftW,alphaSoft,Ndir); else { assert(SMethod==SMNPoly); @@ -499,28 +509,40 @@ public: } } +public: static void SmoothDirections(MeshType &mesh,SmoothParam SParam) { - //for the moment only cross and line field -// //initialize direction by curvature if needed - if ((SParam.alpha_curv>0)|| - (SParam.sharp_thr>0)|| - (SParam.curv_thr>0)) + if ((SParam.SmoothM==SMMiq)||(SParam.SmoothM==SMNPoly)) { - InitByCurvature(mesh,SParam.curvRing); - SelectConstraints(mesh,SParam); + // //initialize direction by curvature if needed + if ((SParam.alpha_curv>0)|| + (SParam.sharp_thr>0)|| + (SParam.curv_thr>0)) + { + InitByCurvature(mesh,SParam.curvRing); + SelectConstraints(mesh,SParam); + } + else + { + SelectConstraints(mesh,SParam); + vcg::tri::CrossField::PropagateFromSelF(mesh); + } + SmoothDirectionsIGL(mesh,SParam.Ndir,SParam.SmoothM,true,SParam.alpha_curv); } else { - SelectConstraints(mesh,SParam); - vcg::tri::CrossField::PropagateFromSelF(mesh); + std::cout<<"ITERATIVE"<0)||(SParam.curv_thr>0)) + SelectConstraints(mesh,SParam); + + bool weightByAni=(SParam.alpha_curv>0); + vcg::tri::CrossField::SmoothIterative(mesh,SParam.Ndir,(int)SParam.IteN,true,false,weightByAni); } - - - //then do the actual smooth - SmoothDirections(mesh,SParam.Ndir,SParam.SmoothM,true,SParam.alpha_curv); } }; From f5cec3e7949691cb50aeb2b403a12fd42706ab51 Mon Sep 17 00:00:00 2001 From: nico Date: Mon, 29 Nov 2021 07:17:31 +1100 Subject: [PATCH 25/38] changed numeric_limits::lowest with -std::numeric_limits::max as it was creating problems with percentile in hist --- vcg/complex/algorithms/stat.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vcg/complex/algorithms/stat.h 
b/vcg/complex/algorithms/stat.h index 40eca014..bb1eec41 100644 --- a/vcg/complex/algorithms/stat.h +++ b/vcg/complex/algorithms/stat.h @@ -99,7 +99,7 @@ public: static std::pair ComputePerFaceQualityMinMax( const MeshType & m) { tri::RequirePerFaceQuality(m); - std::pair minmax = std::make_pair(std::numeric_limits::max(),-std::numeric_limits::lowest()); + std::pair minmax = std::make_pair(std::numeric_limits::max(),-std::numeric_limits::max()); ConstFaceIterator fi; for(fi = m.face.begin(); fi != m.face.end(); ++fi) @@ -122,7 +122,7 @@ public: static std::pair ComputePerTetraQualityMinMax(const MeshType & m) { tri::RequirePerTetraQuality(m); - std::pair minmax = std::make_pair(std::numeric_limits::max(), std::numeric_limits::lowest()); + std::pair minmax = std::make_pair(std::numeric_limits::max(), -std::numeric_limits::max()); ForEachTetra(m, [&minmax] (const TetraType & t) { if (t.cQ() < minmax.first) minmax.first = t.cQ(); @@ -179,7 +179,7 @@ public: static std::pair ComputePerEdgeQualityMinMax(const MeshType & m) { tri::RequirePerEdgeQuality(m); - std::pair minmax = std::make_pair(std::numeric_limits::max(),std::numeric_limits::lowest()); + std::pair minmax = std::make_pair(std::numeric_limits::max(),-std::numeric_limits::max()); EdgeIterator ei; for(ei = m.edge.begin(); ei != m.edge.end(); ++ei) From dde8bf219ef1cf50bd9964552ec3f27a8e307450 Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Mon, 29 Nov 2021 09:56:06 +0100 Subject: [PATCH 26/38] refactored and corrected min/max quality stat functions --- vcg/complex/algorithms/stat.h | 165 ++++++++++++++++++---------------- 1 file changed, 89 insertions(+), 76 deletions(-) diff --git a/vcg/complex/algorithms/stat.h b/vcg/complex/algorithms/stat.h index bb1eec41..e7c4e787 100644 --- a/vcg/complex/algorithms/stat.h +++ b/vcg/complex/algorithms/stat.h @@ -35,7 +35,7 @@ namespace vcg { -namespace tri{ +namespace tri { template class Stat { @@ -59,78 +59,106 @@ public: typedef typename MeshType::TetraContainer 
TetraContainer; typedef typename vcg::Box3 Box3Type; - static void ComputePerVertexQualityMinMax(const MeshType & m, ScalarType &minV, ScalarType &maxV) - { - std::pair pp = ComputePerVertexQualityMinMax(m); - - minV=pp.first; - maxV=pp.second; - } - static std::pair ComputePerVertexQualityMinMax(const MeshType & m) - { -// assert(0); - tri::RequirePerVertexQuality(m); - /** Please if you need to create an attribute called minmaxQ, implement an - explicit function that does it. This function should take a const Mesh. **/ - //typename MeshType::template PerMeshAttributeHandle < std::pair > mmqH; - //mmqH = tri::Allocator::template GetPerMeshAttribute >(m,"minmaxQ"); + static void ComputePerVertexQualityMinMax(const MeshType & m, ScalarType &minV, ScalarType &maxV) + { + const auto minmax = ComputePerVertexQualityMinMax(m); - std::pair minmax = std::make_pair(std::numeric_limits::max(), -std::numeric_limits::max()); + minV = minmax.first; + maxV = minmax.second; + } - for(ConstVertexIterator vi = m.vert.begin(); vi != m.vert.end(); ++vi) - if(!(*vi).IsD()) - { - if( (*vi).Q() < minmax.first) minmax.first = (*vi).Q(); - if( (*vi).Q() > minmax.second) minmax.second = (*vi).Q(); - } + static std::pair ComputePerVertexQualityMinMax(const MeshType & m) + { + /** Please if you need to create an attribute called minmaxQ, implement an + explicit function that does it. This function should take a const Mesh. 
**/ - //mmqH() = minmax; - return minmax; - } + tri::RequirePerVertexQuality(m); + std::pair minmax = std::make_pair(std::numeric_limits::max(), + std::numeric_limits::lowest()); - static void ComputePerFaceQualityMinMax(const MeshType & m, ScalarType &minV, ScalarType &maxV) - { - std::pair pp = ComputePerFaceQualityMinMax(m); - minV=pp.first; - maxV=pp.second; - } + ForEachVertex(m, [&minmax] (const VertexType & v) + { + if( v.Q() < minmax.first) + minmax.first = v.Q(); + if( v.Q() > minmax.second) + minmax.second = v.Q(); + }); - static std::pair ComputePerFaceQualityMinMax( const MeshType & m) - { - tri::RequirePerFaceQuality(m); - std::pair minmax = std::make_pair(std::numeric_limits::max(),-std::numeric_limits::max()); + return minmax; + } - ConstFaceIterator fi; - for(fi = m.face.begin(); fi != m.face.end(); ++fi) - if(!(*fi).IsD()) - { - if( (*fi).Q() < minmax.first) minmax.first = (*fi).Q(); - if( (*fi).Q() > minmax.second) minmax.second = (*fi).Q(); - } - return minmax; - } + static void ComputePerFaceQualityMinMax(const MeshType & m, ScalarType &minV, ScalarType &maxV) + { + const auto minmax = ComputePerFaceQualityMinMax(m); - static void ComputePerTetraQualityMinMax(MeshType & m, ScalarType & minQ, ScalarType & maxQ) - { - std::pair minmax = ComputerPerTetraQualityMinMax(m); + minV = minmax.first; + maxV = minmax.second; + } - minQ = minmax.first; - maxQ = minmax.second; - } + static std::pair ComputePerFaceQualityMinMax(const MeshType & m) + { + tri::RequirePerFaceQuality(m); + std::pair minmax = std::make_pair(std::numeric_limits::max(), + std::numeric_limits::lowest()); - static std::pair ComputePerTetraQualityMinMax(const MeshType & m) - { - tri::RequirePerTetraQuality(m); - std::pair minmax = std::make_pair(std::numeric_limits::max(), -std::numeric_limits::max()); + ForEachTetra(m, [&minmax] (const FaceType & f) { + if (f.cQ() < minmax.first) + minmax.first = f.cQ(); + if (f.cQ() > minmax.second) + minmax.second = f.cQ(); + }); - ForEachTetra(m, 
[&minmax] (const TetraType & t) { - if (t.cQ() < minmax.first) minmax.first = t.cQ(); - if (t.cQ() > minmax.second) minmax.second = t.cQ(); - }); + return minmax; + } - return minmax; - } + static void ComputePerTetraQualityMinMax(const MeshType & m, ScalarType & minQ, ScalarType & maxQ) + { + const auto minmax = ComputePerTetraQualityMinMax(m); + + minQ = minmax.first; + maxQ = minmax.second; + } + + static std::pair ComputePerTetraQualityMinMax(const MeshType & m) + { + tri::RequirePerTetraQuality(m); + std::pair minmax = std::make_pair(std::numeric_limits::max(), + std::numeric_limits::lowest()); + + ForEachTetra(m, [&minmax] (const TetraType & t) { + if (t.cQ() < minmax.first) + minmax.first = t.cQ(); + if (t.cQ() > minmax.second) + minmax.second = t.cQ(); + }); + + return minmax; + } + + static std::pair ComputePerEdgeQualityMinMax(const MeshType & m, ScalarType & minQ, ScalarType & maxQ) + { + const auto minmax = ComputePerEdgeQualityMinMax(m); + + minQ = minmax.first; + maxQ = minmax.second; + } + + static std::pair ComputePerEdgeQualityMinMax(const MeshType & m) + { + tri::RequirePerEdgeQuality(m); + std::pair minmax = std::make_pair(std::numeric_limits::max(), + std::numeric_limits::lowest()); + + ForEachEdge(m, [&minmax] (const EdgeType & e) { + if (e.cQ() < minmax.first) + minmax.first = e.cQ(); + if (e.cQ() > minmax.second) + minmax.second = e.cQ(); + }); + + return minmax; + } static ScalarType ComputePerTetraQualityAvg(const MeshType & m) { @@ -176,21 +204,6 @@ public: return (AvgQ/(ScalarType)num); } - static std::pair ComputePerEdgeQualityMinMax(const MeshType & m) - { - tri::RequirePerEdgeQuality(m); - std::pair minmax = std::make_pair(std::numeric_limits::max(),-std::numeric_limits::max()); - - EdgeIterator ei; - for(ei = m.edge.begin(); ei != m.edge.end(); ++ei) - if(!(*ei).IsD()) - { - if( (*ei).cQ() < minmax.first) minmax.first =(*ei).cQ(); - if( (*ei).cQ() > minmax.second) minmax.second=(*ei).cQ(); - } - return minmax; - } - /** \short compute 
the pointcloud barycenter. E.g. it assume each vertex has a mass. If useQualityAsWeight is true, vertex quality is the mass of the vertices From 1b2e3655ad5ab94b767f601f68a9886697bcadd0 Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Mon, 29 Nov 2021 09:57:43 +0100 Subject: [PATCH 27/38] fixed .obj material import (it was reading only malformed .mtl) --- wrap/io_trimesh/import_obj.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/wrap/io_trimesh/import_obj.h b/wrap/io_trimesh/import_obj.h index 974468be..ebd20578 100644 --- a/wrap/io_trimesh/import_obj.h +++ b/wrap/io_trimesh/import_obj.h @@ -1005,7 +1005,7 @@ public: } else first = false; - //strcpy(currentMaterial.name, tokens[1].c_str()); + if(tokens.size() < 2) return false; else if (tokens.size() == 2) @@ -1014,17 +1014,17 @@ public: currentMaterial.materialName = line.substr(7); //space in the name, get everything after "newmtl " } else if (header.compare("Ka")==0) { - if (tokens.size() < 4) { + if (tokens.size() >= 4) { currentMaterial.Ka = Point3fFrom3Tokens(tokens,1); } } else if (header.compare("Kd")==0) { - if (tokens.size() < 4) { + if (tokens.size() >= 4) { currentMaterial.Kd = Point3fFrom3Tokens(tokens,1); } } else if (header.compare("Ks")==0) { - if (tokens.size() < 4) { + if (tokens.size() >= 4) { currentMaterial.Ks = Point3fFrom3Tokens(tokens,1); } } From 5eb0e6cff6c8ec74a6c5279feb7c037bf6b8463a Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Mon, 29 Nov 2021 10:08:58 +0100 Subject: [PATCH 28/38] stat bugfix --- vcg/complex/algorithms/stat.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vcg/complex/algorithms/stat.h b/vcg/complex/algorithms/stat.h index e7c4e787..1b402af7 100644 --- a/vcg/complex/algorithms/stat.h +++ b/vcg/complex/algorithms/stat.h @@ -73,7 +73,6 @@ public: explicit function that does it. This function should take a const Mesh. 
**/ tri::RequirePerVertexQuality(m); - std::pair minmax = std::make_pair(std::numeric_limits::max(), std::numeric_limits::lowest()); @@ -102,7 +101,7 @@ public: std::pair minmax = std::make_pair(std::numeric_limits::max(), std::numeric_limits::lowest()); - ForEachTetra(m, [&minmax] (const FaceType & f) { + ForEachFace(m, [&minmax] (const FaceType & f) { if (f.cQ() < minmax.first) minmax.first = f.cQ(); if (f.cQ() > minmax.second) From da3c5c2b96d56c975ec4aae7c415b9eba2c25d66 Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Mon, 29 Nov 2021 12:15:37 +0100 Subject: [PATCH 29/38] initializer list corrected --- vcg/complex/algorithms/isotropic_remeshing.h | 4 ++-- vcg/complex/foreach.h | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vcg/complex/algorithms/isotropic_remeshing.h b/vcg/complex/algorithms/isotropic_remeshing.h index 6d79897a..208a1181 100644 --- a/vcg/complex/algorithms/isotropic_remeshing.h +++ b/vcg/complex/algorithms/isotropic_remeshing.h @@ -685,7 +685,7 @@ private: testSwap(pi, params.creaseAngleCosThr) && // face::CheckFlipEdge(f, i) && face::CheckFlipEdgeNormal(f, i, float(vcg::math::ToRad(5.))) && - (!params.surfDistCheck || testHausdorff(*params.mProject, params.grid, { swapEdgeMidPoint }, params.maxSurfDist))) + (!params.surfDistCheck || testHausdorff(*params.mProject, params.grid, {{ swapEdgeMidPoint }}, params.maxSurfDist))) { //When doing the swap we need to preserve and update the crease info accordingly FaceType* g = f.cFFp(i); @@ -872,7 +872,7 @@ private: }; if (!testHausdorff(*(params.mProject), params.grid, points, params.maxSurfDist) || - !testHausdorff(*(params.mProject), params.grid, { (v1->cP() + v2->cP() + mp) / 3. }, params.maxSurfDist, newN)) + !testHausdorff(*(params.mProject), params.grid, {{ (v1->cP() + v2->cP() + mp) / 3. 
}}, params.maxSurfDist, newN)) return false; } } diff --git a/vcg/complex/foreach.h b/vcg/complex/foreach.h index c2fa2cef..fde75fff 100644 --- a/vcg/complex/foreach.h +++ b/vcg/complex/foreach.h @@ -244,7 +244,7 @@ inline void ForEachEdge(MeshType &m, Callable action) { for(auto ei=m.edge.begin();ei!=m.edge.end();++ei) { action(*ei); - } + } } else { @@ -252,7 +252,7 @@ inline void ForEachEdge(MeshType &m, Callable action) if(!(*ei).IsD()) { action(*ei); - } + } } } @@ -273,7 +273,7 @@ inline void ForEachTetra(const MeshType &m, Callable action) { for(auto ti = m.tetra.begin(); ti != m.tetra.end(); ++ti) { action(*ti); - } + } } else { @@ -281,7 +281,7 @@ inline void ForEachTetra(const MeshType &m, Callable action) if(!(*ti).IsD()) { action(*ti); - } + } } } @@ -292,7 +292,7 @@ inline void ForEachTetra(MeshType &m, Callable action) { for(auto ti = m.tetra.begin(); ti != m.tetra.end(); ++ti) { action(*ti); - } + } } else { From be8f3e773fe46e991945df4674ca24cdce196195 Mon Sep 17 00:00:00 2001 From: Luigi Malomo Date: Mon, 29 Nov 2021 12:46:50 +0100 Subject: [PATCH 30/38] fixed 'auto' abuse --- vcg/complex/algorithms/isotropic_remeshing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcg/complex/algorithms/isotropic_remeshing.h b/vcg/complex/algorithms/isotropic_remeshing.h index 208a1181..5ddfa7d2 100644 --- a/vcg/complex/algorithms/isotropic_remeshing.h +++ b/vcg/complex/algorithms/isotropic_remeshing.h @@ -865,7 +865,7 @@ private: //check on new face distance from original mesh if (params.surfDistCheck) { - const auto points = { + const std::vector points { (v1->cP() + mp) / 2., (v2->cP() + mp) / 2., mp, From 8c15e83600f08442fe8494a3ade5880cfed494a6 Mon Sep 17 00:00:00 2001 From: korialis Date: Thu, 2 Dec 2021 10:07:33 +0100 Subject: [PATCH 31/38] restore valence counting with vfstarvf --- vcg/complex/algorithms/isotropic_remeshing.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git 
a/vcg/complex/algorithms/isotropic_remeshing.h b/vcg/complex/algorithms/isotropic_remeshing.h index 5ddfa7d2..aa142888 100644 --- a/vcg/complex/algorithms/isotropic_remeshing.h +++ b/vcg/complex/algorithms/isotropic_remeshing.h @@ -619,26 +619,30 @@ private: std::vector incident; - // vcg::face::VVStarVF(tp.V(), incident); - idealV = idealValence(tp); actualV = tp.NumberOfIncidentVertices();//int(incident.size()); + vcg::face::VVStarVF(tp.V(), incident); + idealV = idealValence(tp); + actualV = int(incident.size()); //tp.NumberOfIncidentVertices(); oldDist += abs(idealV - actualV); newDist += abs(idealV - (actualV - 1)); tp.NextF();tp.FlipE();tp.FlipV(); const VertexType *v1=tp.V(); - // vcg::face::VVStarVF(tp.V(), incident); - idealV = idealValence(tp); actualV = tp.NumberOfIncidentVertices();//int(incident.size()); + vcg::face::VVStarVF(tp.V(), incident); + idealV = idealValence(tp); + actualV = int(incident.size()); //tp.NumberOfIncidentVertices(); oldDist += abs(idealV - actualV); newDist += abs(idealV - (actualV + 1)); tp.FlipE();tp.FlipV();tp.FlipE(); const VertexType *v2=tp.V(); - // vcg::face::VVStarVF(tp.V(), incident); - idealV = idealValence(tp); actualV = tp.NumberOfIncidentVertices();//int(incident.size()); + vcg::face::VVStarVF(tp.V(), incident); + idealV = idealValence(tp); + actualV = int(incident.size()); //tp.NumberOfIncidentVertices(); oldDist += abs(idealV - actualV); newDist += abs(idealV - (actualV - 1)); tp.NextF();tp.FlipE();tp.FlipV(); const VertexType *v3=tp.V(); - // vcg::face::VVStarVF(tp.V(), incident); - idealV = idealValence(tp); actualV = tp.NumberOfIncidentVertices();//int(incident.size()); + vcg::face::VVStarVF(tp.V(), incident); + idealV = idealValence(tp); + actualV = int(incident.size());//tp.NumberOfIncidentVertices(); oldDist += abs(idealV - actualV); newDist += abs(idealV - (actualV + 1)); const ScalarType qOld = std::min(Quality(v0->P(),v2->P(),v3->P()),Quality(v0->P(),v1->P(),v2->P())); From 
b95d6abc98dc18030593f195068b731854fae30e Mon Sep 17 00:00:00 2001 From: korialis Date: Thu, 2 Dec 2021 10:35:22 +0100 Subject: [PATCH 32/38] fix namespace --- wrap/io_tetramesh/import_ply.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/wrap/io_tetramesh/import_ply.h b/wrap/io_tetramesh/import_ply.h index ebd4589a..40fe4fc2 100644 --- a/wrap/io_tetramesh/import_ply.h +++ b/wrap/io_tetramesh/import_ply.h @@ -204,7 +204,7 @@ public: static int Open( OpenMeshType &m, const char * filename, PlyInfo & pi ) { assert(filename!=0); - vector index; + std::vector index; LoadPly_TetraAux fa; LoadPly_VertAux va; @@ -273,8 +273,8 @@ public: } } // Descrittori definiti dall'utente, - vector VPV(pi.vdn); // property descriptor relative al tipo LoadPly_VertexAux - vector FPV(pi.fdn); // property descriptor relative al tipo LoadPly_FaceAux + std::vector VPV(pi.vdn); // property descriptor relative al tipo LoadPly_VertexAux + std::vector FPV(pi.fdn); // property descriptor relative al tipo LoadPly_FaceAux if(pi.vdn>0){ // Compute the total size needed to load additional per vertex data. 
@@ -318,7 +318,7 @@ public: int j; pf.SetCurElement(i); - VertexIterator vi=Allocator::AddVertices(m,n); + VertexIterator vi=vcg::tri::Allocator::AddVertices(m,n); for(j=0;j Date: Thu, 2 Dec 2021 10:42:01 +0100 Subject: [PATCH 33/38] fix namespace --- wrap/io_tetramesh/import_ply.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wrap/io_tetramesh/import_ply.h b/wrap/io_tetramesh/import_ply.h index 40fe4fc2..a5a8c489 100644 --- a/wrap/io_tetramesh/import_ply.h +++ b/wrap/io_tetramesh/import_ply.h @@ -363,7 +363,7 @@ public: { int j; int k; - TetraIterator ti = Allocator::AddTetras(m, n); + TetraIterator ti = vcg::tri::Allocator::AddTetras(m, n); pf.SetCurElement(i); for(j=0;j Date: Mon, 13 Dec 2021 10:08:54 +0100 Subject: [PATCH 34/38] removing template arguments to swap, it hurts msvc --- vcg/space/intersection2.h | 772 +++++++++++++++++++------------------- 1 file changed, 386 insertions(+), 386 deletions(-) diff --git a/vcg/space/intersection2.h b/vcg/space/intersection2.h index 115992a6..275affc0 100644 --- a/vcg/space/intersection2.h +++ b/vcg/space/intersection2.h @@ -1,386 +1,386 @@ -/**************************************************************************** -* VCGLib o o * -* Visual and Computer Graphics Library o o * -* _ O _ * -* Copyright(C) 2004-2016 \/)\/ * -* Visual Computing Lab /\/| * -* ISTI - Italian National Research Council | * -* \ * -* All rights reserved. * -* * -* This program is free software; you can redistribute it and/or modify * -* it under the terms of the GNU General Public License as published by * -* the Free Software Foundation; either version 2 of the License, or * -* (at your option) any later version. * -* * -* This program is distributed in the hope that it will be useful, * -* but WITHOUT ANY WARRANTY; without even the implied warranty of * -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * -* GNU General Public License (http://www.gnu.org/licenses/gpl.txt) * -* for more details. 
* -* * -****************************************************************************/ -/**************************************************************************** -History - -$Log: not supported by cvs2svn $ -Revision 1.6 2007/05/08 12:11:58 pietroni -added circle-line intersection - - -****************************************************************************/ - - - -#ifndef __VCGLIB_INTERSECTION_2 -#define __VCGLIB_INTERSECTION_2 -#include -#include -#include -#include -#include -#include -#include - - - -namespace vcg { - /** \addtogroup space */ - /*@{*/ - /** - Function computing the intersection between couple of geometric primitives in - 2 dimension - */ - - /// return true if the algle is convex (right rotation) - template - inline bool Convex(const Point2 & p0,const Point2 & p1,const Point2 & p2) - { - const SCALAR_TYPE EPS= SCALAR_TYPE(1e-8); - return (((p0-p1)^(p2-p1))<=EPS); - } - - ///return if exist the intersection point - ///between 2 lines in a 2d plane - template - inline bool LineLineIntersection(const vcg::Line2 & l0, - const vcg::Line2 & l1, - Point2 &p) - { - const SCALAR_TYPE Eps= SCALAR_TYPE(1e-8); - ///first line - SCALAR_TYPE x1=l0.Origin().X(); - SCALAR_TYPE y1=l0.Origin().Y(); - SCALAR_TYPE x2=x1+l0.Direction().X(); - SCALAR_TYPE y2=y1+l0.Direction().Y(); - - ///second line - SCALAR_TYPE x3=l1.Origin().X(); - SCALAR_TYPE y3=l1.Origin().Y(); - SCALAR_TYPE x4=x3+l1.Direction().X(); - SCALAR_TYPE y4=y3+l1.Direction().Y(); - - ///then find intersection - - ///denominator - SCALAR_TYPE den=((x1-x2)*(y3-y4))-((y1-y2)*(x3-x4)); - if (fabs(den) - inline bool RayLineIntersection(const vcg::Line2 & l, - const vcg::Ray2 & r, - Point2 &p) - { - ///construct line from ray - vcg::Line2 l_test; - l_test.Set(r.Origin(),r.Direction()); - if (!LineLineIntersection(l,l_test,p)) - return false; - Point2 dir=p-r.Origin(); - dir.Normalize(); - return (dir*r.Direction()>0); - } - - - /// interseciton between point and triangle - template - inline bool 
RaySegmentIntersection(const vcg::Ray2 & r, - const vcg::Segment2 &seg, - Point2 &p_inters) - { - ///first compute intersection between lines - vcg::Line2 line2; - line2.SetOrigin(seg.P0()); - vcg::Point2 dir=seg.P1()-seg.P0(); - dir.Normalize(); - line2.SetDirection(dir); - if(!RayLineIntersection(line2,r,p_inters)) - return false; - ///then test if intersection point is nearest - ///to both extremes then length of the segment - SCALAR_TYPE d0=(seg.P1()-p_inters).Norm(); - SCALAR_TYPE d1=(seg.P0()-p_inters).Norm(); - SCALAR_TYPE length=(seg.P0()-seg.P1()).Norm(); - return ((d0 - inline bool RayBoxIntersection(const vcg::Ray2 & r, - const vcg::Box2 &bbox, - Point2 &p_inters) - { - ///first create the 4 segments - vcg::Segment2 S[4]; - for (int i=0;i<4;i++) - S[i]=vcg::Segment2(bbox.P(i),bbox.P((i+1)%4)); - - SCALAR_TYPE mind=std::numeric_limits::max(); - bool found=false; - for (int i=0;i<4;i++) - { - Point2 p_inters_test; - if (!RaySegmentIntersection(r,S[i],p_inters_test))continue; - SCALAR_TYPE Norm=(p_inters_test-r.Origin()).Norm(); - if (Norm - inline bool LineSegmentIntersection(const vcg::Line2 & line, - const vcg::Segment2 &seg, - Point2 &p_inters) - { - ///first compute intersection between lines - vcg::Line2 line2; - line2.SetOrigin(seg.P0()); - vcg::Point2 dir=seg.P1()-seg.P0(); - dir.Normalize(); - line2.SetDirection(dir); - if(!LineLineIntersection(line,line2,p_inters)) - return false; - ///then test if intersection point is nearest - ///to both extremes then length of the segment - SCALAR_TYPE d0=(seg.P1()-p_inters).Norm(); - SCALAR_TYPE d1=(seg.P0()-p_inters).Norm(); - SCALAR_TYPE length=(seg.P0()-seg.P1()).Norm(); - return ((d0 - inline bool SegmentSegmentIntersection(const vcg::Segment2 &seg0, - const vcg::Segment2 &seg1, - Point2 &p_inters) - { - const SCALAR_TYPE Eps= SCALAR_TYPE(1e-8); - SCALAR_TYPE lambda0,lambda1; - const Point2 & p0 = seg0.P0(); - const Point2 & p1 = seg0.P1(); - const Point2 & p2 = seg1.P0(); - const Point2 & p3 = seg1.P1(); 
- - SCALAR_TYPE a = (p1-p0)[0]; - SCALAR_TYPE b = (p2-p3)[0]; - SCALAR_TYPE c = (p1-p0)[1]; - SCALAR_TYPE d = (p2-p3)[1]; - - SCALAR_TYPE e = (p2-p0)[0]; - SCALAR_TYPE f = (p2-p0)[1]; - - SCALAR_TYPE det = a*d-b*c; - - lambda0 = (d*e-b*f)/det; - lambda1 = (-c*e+a*f)/det; - if (fabs(det)= 0.0 && lambda0 <= 1.0 && lambda1 >= 0.0 && lambda1 <= 1.0)) - return false; - p_inters = p0*(1-lambda0)+p1*lambda0; - return true; - } - /// interseciton between point and triangle - template - inline bool IsInsideTrianglePoint( const Triangle2 & t,const Point2 & p) - { - Point2 p0=t.P0(0); - Point2 p1=t.P0(1); - Point2 p2=t.P0(2); - - ///first test with bounding box - vcg::Box2 b2d; - b2d.Add(p0); - b2d.Add(p1); - b2d.Add(p2); - if (!b2d.IsIn(p)) - return false; - - ///then text convex - if (!Convex(p0,p1,p2)) - std::swap >(p1,p2); - return((Convex(p,p0,p1))&&(Convex(p,p1,p2))&&(Convex(p,p2,p0))); - //return((Convex(p,p0,p1))&&(Convex(p,p1,p2))&&(Convex(p,p2,p0))); - } - - template - bool TriangleTriangleIntersect2D(const vcg::Triangle2 &tr0, - const vcg::Triangle2 &tr1) - { - ///test BBox Intersection - vcg::Box2 bbtr0; - bbtr0.Add(tr0.P(0)); - bbtr0.Add(tr0.P(1)); - bbtr0.Add(tr0.P(2)); - vcg::Box2 bbtr1; - bbtr1.Add(tr1.P(0)); - bbtr1.Add(tr1.P(1)); - bbtr1.Add(tr1.P(2)); - if (!bbtr0.Collide(bbtr1)) return false; - ///test vertex in face - for (int i=0;i<3;i++) - { - bool inside0=vcg::IsInsideTrianglePoint(tr0,tr1.P(i)); - bool inside1=vcg::IsInsideTrianglePoint(tr1,tr0.P(i)); - if (inside0 || inside1) return true; - } - ///test segment - ///to segment intersection - for (int i=0;i<3;i++) - { - for (int j=0;j<3;j++) - { - if (i>j) continue; - vcg::Segment2 seg0=vcg::Segment2(tr0.P(i),tr0.P((i+1)%3)); - vcg::Segment2 seg1=vcg::Segment2(tr1.P(j),tr1.P((j+1)%3)); - vcg::Point2 p_inters; - bool intersect=SegmentSegmentIntersection(seg0,seg1,p_inters); - if (intersect) return true; - } - } - return false; - } - - template - bool PointInsidePolygon(vcg::Point2 p, - const std::vector 
> &polygon) - { - int n=polygon.size(); - vcg::Box2 BB; - for (int i=0;i r; - vcg::Point2 direct=vcg::Point2(0,0); - switch (dir) - { - case 0 : direct.X()=1;break; - case 1 : direct.Y()=1;break; - case 2 : direct.X()=-1; break; - default :direct.Y()=-1; - } - r.SetOrigin(p); - r.SetDirection(direct); - for (int i=0;i p_inters; - if (vcg::RaySegmentIntersection(r,polygon[i],p_inters))intersection++; - } - if ((intersection%2)==1) - inside_test++; - } - return(inside_test>2); - } - - //intersection between a circle and a line - template - inline bool CircleLineIntersection(const vcg::Line2 & line, - const vcg::Point2 ¢er, - const ScalarType &radius, - vcg::Point2 &p0, - vcg::Point2 &p1) - { - ///translate with origin on the center - ScalarType x1,x2,y1,y2; - x1=line.Origin().X()-center.X(); - y1=line.Origin().Y()-center.Y(); - x2=x1+line.Direction().X(); - y2=y1+line.Direction().Y(); - - ScalarType dx,dy,dr,D,delta,sign; - dx=x2-x1; - dy=y2-y1; - dr=sqrt(dx*dx+dy*dy); - D=x1*y2-x2*y1; - delta=radius*radius*dr*dr-D*D; - if (dy>=0) - sign=1; - else - sign=-1; - - if (delta<0.000001) - return false;///no intersection - else - { - p0.X()=(D*dy+sign*dx*sqrt(delta))/dr*dr; - p0.Y()=(-D*dx+fabs(dy)*sqrt(delta))/dr*dr; - p1.X()=(D*dy-sign*dx*sqrt(delta))/dr*dr; - p1.Y()=(-D*dx-fabs(dy)*sqrt(delta))/dr*dr; - p0+=center; - p1+=center; - return true; - } - } - - - // Ray-Segment Functor - class RaySegmentIntersectionFunctor { - public: - - template - inline bool operator () (const SEGMENTTYPE & S, - const Ray2 & ray, - SCALARTYPE & t) - { - typedef SCALARTYPE ScalarType; - typedef vcg::Point2 CoordType; - - CoordType inters_test; - bool bret = RaySegmentIntersection(ray,S, inters_test); - if (bret) - t=(inters_test-ray.Origin()).Norm(); - return (bret); - } - }; - - /*@}*/ -} // end namespace -#endif +/**************************************************************************** +* VCGLib o o * +* Visual and Computer Graphics Library o o * +* _ O _ * +* Copyright(C) 2004-2016 
\/)\/ * +* Visual Computing Lab /\/| * +* ISTI - Italian National Research Council | * +* \ * +* All rights reserved. * +* * +* This program is free software; you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published by * +* the Free Software Foundation; either version 2 of the License, or * +* (at your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License (http://www.gnu.org/licenses/gpl.txt) * +* for more details. * +* * +****************************************************************************/ +/**************************************************************************** +History + +$Log: not supported by cvs2svn $ +Revision 1.6 2007/05/08 12:11:58 pietroni +added circle-line intersection + + +****************************************************************************/ + + + +#ifndef __VCGLIB_INTERSECTION_2 +#define __VCGLIB_INTERSECTION_2 +#include +#include +#include +#include +#include +#include +#include + + + +namespace vcg { + /** \addtogroup space */ + /*@{*/ + /** + Function computing the intersection between couple of geometric primitives in + 2 dimension + */ + + /// return true if the algle is convex (right rotation) + template + inline bool Convex(const Point2 & p0,const Point2 & p1,const Point2 & p2) + { + const SCALAR_TYPE EPS= SCALAR_TYPE(1e-8); + return (((p0-p1)^(p2-p1))<=EPS); + } + + ///return if exist the intersection point + ///between 2 lines in a 2d plane + template + inline bool LineLineIntersection(const vcg::Line2 & l0, + const vcg::Line2 & l1, + Point2 &p) + { + const SCALAR_TYPE Eps= SCALAR_TYPE(1e-8); + ///first line + SCALAR_TYPE x1=l0.Origin().X(); + SCALAR_TYPE y1=l0.Origin().Y(); + SCALAR_TYPE x2=x1+l0.Direction().X(); + SCALAR_TYPE y2=y1+l0.Direction().Y(); + + 
///second line + SCALAR_TYPE x3=l1.Origin().X(); + SCALAR_TYPE y3=l1.Origin().Y(); + SCALAR_TYPE x4=x3+l1.Direction().X(); + SCALAR_TYPE y4=y3+l1.Direction().Y(); + + ///then find intersection + + ///denominator + SCALAR_TYPE den=((x1-x2)*(y3-y4))-((y1-y2)*(x3-x4)); + if (fabs(den) + inline bool RayLineIntersection(const vcg::Line2 & l, + const vcg::Ray2 & r, + Point2 &p) + { + ///construct line from ray + vcg::Line2 l_test; + l_test.Set(r.Origin(),r.Direction()); + if (!LineLineIntersection(l,l_test,p)) + return false; + Point2 dir=p-r.Origin(); + dir.Normalize(); + return (dir*r.Direction()>0); + } + + + /// interseciton between point and triangle + template + inline bool RaySegmentIntersection(const vcg::Ray2 & r, + const vcg::Segment2 &seg, + Point2 &p_inters) + { + ///first compute intersection between lines + vcg::Line2 line2; + line2.SetOrigin(seg.P0()); + vcg::Point2 dir=seg.P1()-seg.P0(); + dir.Normalize(); + line2.SetDirection(dir); + if(!RayLineIntersection(line2,r,p_inters)) + return false; + ///then test if intersection point is nearest + ///to both extremes then length of the segment + SCALAR_TYPE d0=(seg.P1()-p_inters).Norm(); + SCALAR_TYPE d1=(seg.P0()-p_inters).Norm(); + SCALAR_TYPE length=(seg.P0()-seg.P1()).Norm(); + return ((d0 + inline bool RayBoxIntersection(const vcg::Ray2 & r, + const vcg::Box2 &bbox, + Point2 &p_inters) + { + ///first create the 4 segments + vcg::Segment2 S[4]; + for (int i=0;i<4;i++) + S[i]=vcg::Segment2(bbox.P(i),bbox.P((i+1)%4)); + + SCALAR_TYPE mind=std::numeric_limits::max(); + bool found=false; + for (int i=0;i<4;i++) + { + Point2 p_inters_test; + if (!RaySegmentIntersection(r,S[i],p_inters_test))continue; + SCALAR_TYPE Norm=(p_inters_test-r.Origin()).Norm(); + if (Norm + inline bool LineSegmentIntersection(const vcg::Line2 & line, + const vcg::Segment2 &seg, + Point2 &p_inters) + { + ///first compute intersection between lines + vcg::Line2 line2; + line2.SetOrigin(seg.P0()); + vcg::Point2 dir=seg.P1()-seg.P0(); + 
dir.Normalize(); + line2.SetDirection(dir); + if(!LineLineIntersection(line,line2,p_inters)) + return false; + ///then test if intersection point is nearest + ///to both extremes then length of the segment + SCALAR_TYPE d0=(seg.P1()-p_inters).Norm(); + SCALAR_TYPE d1=(seg.P0()-p_inters).Norm(); + SCALAR_TYPE length=(seg.P0()-seg.P1()).Norm(); + return ((d0 + inline bool SegmentSegmentIntersection(const vcg::Segment2 &seg0, + const vcg::Segment2 &seg1, + Point2 &p_inters) + { + const SCALAR_TYPE Eps= SCALAR_TYPE(1e-8); + SCALAR_TYPE lambda0,lambda1; + const Point2 & p0 = seg0.P0(); + const Point2 & p1 = seg0.P1(); + const Point2 & p2 = seg1.P0(); + const Point2 & p3 = seg1.P1(); + + SCALAR_TYPE a = (p1-p0)[0]; + SCALAR_TYPE b = (p2-p3)[0]; + SCALAR_TYPE c = (p1-p0)[1]; + SCALAR_TYPE d = (p2-p3)[1]; + + SCALAR_TYPE e = (p2-p0)[0]; + SCALAR_TYPE f = (p2-p0)[1]; + + SCALAR_TYPE det = a*d-b*c; + + lambda0 = (d*e-b*f)/det; + lambda1 = (-c*e+a*f)/det; + if (fabs(det)= 0.0 && lambda0 <= 1.0 && lambda1 >= 0.0 && lambda1 <= 1.0)) + return false; + p_inters = p0*(1-lambda0)+p1*lambda0; + return true; + } + /// interseciton between point and triangle + template + inline bool IsInsideTrianglePoint( const Triangle2 & t,const Point2 & p) + { + Point2 p0=t.P0(0); + Point2 p1=t.P0(1); + Point2 p2=t.P0(2); + + ///first test with bounding box + vcg::Box2 b2d; + b2d.Add(p0); + b2d.Add(p1); + b2d.Add(p2); + if (!b2d.IsIn(p)) + return false; + + ///then text convex + if (!Convex(p0,p1,p2)) + std::swap(p1,p2); + return((Convex(p,p0,p1))&&(Convex(p,p1,p2))&&(Convex(p,p2,p0))); + //return((Convex(p,p0,p1))&&(Convex(p,p1,p2))&&(Convex(p,p2,p0))); + } + + template + bool TriangleTriangleIntersect2D(const vcg::Triangle2 &tr0, + const vcg::Triangle2 &tr1) + { + ///test BBox Intersection + vcg::Box2 bbtr0; + bbtr0.Add(tr0.P(0)); + bbtr0.Add(tr0.P(1)); + bbtr0.Add(tr0.P(2)); + vcg::Box2 bbtr1; + bbtr1.Add(tr1.P(0)); + bbtr1.Add(tr1.P(1)); + bbtr1.Add(tr1.P(2)); + if (!bbtr0.Collide(bbtr1)) 
return false; + ///test vertex in face + for (int i=0;i<3;i++) + { + bool inside0=vcg::IsInsideTrianglePoint(tr0,tr1.P(i)); + bool inside1=vcg::IsInsideTrianglePoint(tr1,tr0.P(i)); + if (inside0 || inside1) return true; + } + ///test segment + ///to segment intersection + for (int i=0;i<3;i++) + { + for (int j=0;j<3;j++) + { + if (i>j) continue; + vcg::Segment2 seg0=vcg::Segment2(tr0.P(i),tr0.P((i+1)%3)); + vcg::Segment2 seg1=vcg::Segment2(tr1.P(j),tr1.P((j+1)%3)); + vcg::Point2 p_inters; + bool intersect=SegmentSegmentIntersection(seg0,seg1,p_inters); + if (intersect) return true; + } + } + return false; + } + + template + bool PointInsidePolygon(vcg::Point2 p, + const std::vector > &polygon) + { + int n=polygon.size(); + vcg::Box2 BB; + for (int i=0;i r; + vcg::Point2 direct=vcg::Point2(0,0); + switch (dir) + { + case 0 : direct.X()=1;break; + case 1 : direct.Y()=1;break; + case 2 : direct.X()=-1; break; + default :direct.Y()=-1; + } + r.SetOrigin(p); + r.SetDirection(direct); + for (int i=0;i p_inters; + if (vcg::RaySegmentIntersection(r,polygon[i],p_inters))intersection++; + } + if ((intersection%2)==1) + inside_test++; + } + return(inside_test>2); + } + + //intersection between a circle and a line + template + inline bool CircleLineIntersection(const vcg::Line2 & line, + const vcg::Point2 ¢er, + const ScalarType &radius, + vcg::Point2 &p0, + vcg::Point2 &p1) + { + ///translate with origin on the center + ScalarType x1,x2,y1,y2; + x1=line.Origin().X()-center.X(); + y1=line.Origin().Y()-center.Y(); + x2=x1+line.Direction().X(); + y2=y1+line.Direction().Y(); + + ScalarType dx,dy,dr,D,delta,sign; + dx=x2-x1; + dy=y2-y1; + dr=sqrt(dx*dx+dy*dy); + D=x1*y2-x2*y1; + delta=radius*radius*dr*dr-D*D; + if (dy>=0) + sign=1; + else + sign=-1; + + if (delta<0.000001) + return false;///no intersection + else + { + p0.X()=(D*dy+sign*dx*sqrt(delta))/dr*dr; + p0.Y()=(-D*dx+fabs(dy)*sqrt(delta))/dr*dr; + p1.X()=(D*dy-sign*dx*sqrt(delta))/dr*dr; + 
p1.Y()=(-D*dx-fabs(dy)*sqrt(delta))/dr*dr; + p0+=center; + p1+=center; + return true; + } + } + + + // Ray-Segment Functor + class RaySegmentIntersectionFunctor { + public: + + template + inline bool operator () (const SEGMENTTYPE & S, + const Ray2 & ray, + SCALARTYPE & t) + { + typedef SCALARTYPE ScalarType; + typedef vcg::Point2 CoordType; + + CoordType inters_test; + bool bret = RaySegmentIntersection(ray,S, inters_test); + if (bret) + t=(inters_test-ray.Origin()).Norm(); + return (bret); + } + }; + + /*@}*/ +} // end namespace +#endif From d3d496848dcdae95420989fc7ae1e56c57172b40 Mon Sep 17 00:00:00 2001 From: alemuntoni Date: Mon, 13 Dec 2021 15:01:19 +0100 Subject: [PATCH 35/38] add unsupported eigen package --- eigenlib/howto.txt | 4 +- eigenlib/unsupported/Eigen/AdolcForward | 156 ++ eigenlib/unsupported/Eigen/AlignedVector3 | 224 +++ eigenlib/unsupported/Eigen/ArpackSupport | 31 + eigenlib/unsupported/Eigen/AutoDiff | 40 + eigenlib/unsupported/Eigen/BVH | 95 + eigenlib/unsupported/Eigen/CMakeLists.txt | 32 + .../unsupported/Eigen/CXX11/CMakeLists.txt | 8 + eigenlib/unsupported/Eigen/CXX11/Tensor | 154 ++ .../unsupported/Eigen/CXX11/TensorSymmetry | 42 + eigenlib/unsupported/Eigen/CXX11/ThreadPool | 65 + .../Eigen/CXX11/src/Tensor/README.md | 1760 +++++++++++++++++ .../Eigen/CXX11/src/Tensor/Tensor.h | 527 +++++ .../Eigen/CXX11/src/Tensor/TensorArgMax.h | 299 +++ .../Eigen/CXX11/src/Tensor/TensorAssign.h | 181 ++ .../Eigen/CXX11/src/Tensor/TensorBase.h | 1012 ++++++++++ .../CXX11/src/Tensor/TensorBroadcasting.h | 392 ++++ .../Eigen/CXX11/src/Tensor/TensorChipping.h | 384 ++++ .../CXX11/src/Tensor/TensorConcatenation.h | 361 ++++ .../CXX11/src/Tensor/TensorContraction.h | 628 ++++++ .../src/Tensor/TensorContractionBlocking.h | 56 + .../CXX11/src/Tensor/TensorContractionCuda.h | 1391 +++++++++++++ .../src/Tensor/TensorContractionMapper.h | 469 +++++ .../src/Tensor/TensorContractionThreadPool.h | 1043 ++++++++++ .../Eigen/CXX11/src/Tensor/TensorConversion.h | 
279 +++ .../CXX11/src/Tensor/TensorConvolution.h | 1104 +++++++++++ .../Eigen/CXX11/src/Tensor/TensorCostModel.h | 212 ++ .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 313 +++ .../Eigen/CXX11/src/Tensor/TensorDevice.h | 68 + .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 337 ++++ .../CXX11/src/Tensor/TensorDeviceDefault.h | 81 + .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 122 ++ .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 282 +++ .../CXX11/src/Tensor/TensorDimensionList.h | 236 +++ .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 428 ++++ .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 181 ++ .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 633 ++++++ .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 288 +++ .../Eigen/CXX11/src/Tensor/TensorExpr.h | 371 ++++ .../Eigen/CXX11/src/Tensor/TensorFFT.h | 651 ++++++ .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 389 ++++ .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 169 ++ .../src/Tensor/TensorForwardDeclarations.h | 109 + .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 489 +++++ .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 185 ++ .../CXX11/src/Tensor/TensorGlobalFunctions.h | 33 + .../Eigen/CXX11/src/Tensor/TensorIO.h | 79 + .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 509 +++++ .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 725 +++++++ .../Eigen/CXX11/src/Tensor/TensorInflation.h | 229 +++ .../CXX11/src/Tensor/TensorInitializer.h | 82 + .../Eigen/CXX11/src/Tensor/TensorIntDiv.h | 253 +++ .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 209 ++ .../Eigen/CXX11/src/Tensor/TensorMacros.h | 54 + .../Eigen/CXX11/src/Tensor/TensorMap.h | 323 +++ .../Eigen/CXX11/src/Tensor/TensorMeta.h | 218 ++ .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 888 +++++++++ .../Eigen/CXX11/src/Tensor/TensorPadding.h | 397 ++++ .../Eigen/CXX11/src/Tensor/TensorPatch.h | 269 +++ .../Eigen/CXX11/src/Tensor/TensorRandom.h | 276 +++ .../Eigen/CXX11/src/Tensor/TensorReduction.h | 781 ++++++++ .../CXX11/src/Tensor/TensorReductionCuda.h | 750 +++++++ 
.../CXX11/src/Tensor/TensorReductionSycl.h | 242 +++ .../Eigen/CXX11/src/Tensor/TensorRef.h | 429 ++++ .../Eigen/CXX11/src/Tensor/TensorReverse.h | 288 +++ .../Eigen/CXX11/src/Tensor/TensorScan.h | 287 +++ .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 264 +++ .../Eigen/CXX11/src/Tensor/TensorStorage.h | 146 ++ .../Eigen/CXX11/src/Tensor/TensorStriding.h | 338 ++++ .../Eigen/CXX11/src/Tensor/TensorSycl.h | 82 + .../TensorSyclConvertToDeviceExpression.h | 121 ++ .../src/Tensor/TensorSyclExprConstructor.h | 239 +++ .../src/Tensor/TensorSyclExtractAccessor.h | 204 ++ .../src/Tensor/TensorSyclExtractFunctors.h | 177 ++ .../CXX11/src/Tensor/TensorSyclLeafCount.h | 114 ++ .../src/Tensor/TensorSyclPlaceHolderExpr.h | 181 ++ .../Eigen/CXX11/src/Tensor/TensorSyclRun.h | 70 + .../Eigen/CXX11/src/Tensor/TensorSyclTuple.h | 237 +++ .../Eigen/CXX11/src/Tensor/TensorTraits.h | 272 +++ .../Eigen/CXX11/src/Tensor/TensorUInt128.h | 248 +++ .../CXX11/src/Tensor/TensorVolumePatch.h | 608 ++++++ .../src/TensorSymmetry/DynamicSymmetry.h | 293 +++ .../CXX11/src/TensorSymmetry/StaticSymmetry.h | 236 +++ .../Eigen/CXX11/src/TensorSymmetry/Symmetry.h | 338 ++++ .../TensorSymmetry/util/TemplateGroupTheory.h | 669 +++++++ .../Eigen/CXX11/src/ThreadPool/EventCount.h | 233 +++ .../src/ThreadPool/NonBlockingThreadPool.h | 274 +++ .../Eigen/CXX11/src/ThreadPool/RunQueue.h | 210 ++ .../CXX11/src/ThreadPool/SimpleThreadPool.h | 154 ++ .../CXX11/src/ThreadPool/ThreadEnvironment.h | 38 + .../Eigen/CXX11/src/ThreadPool/ThreadLocal.h | 22 + .../src/ThreadPool/ThreadPoolInterface.h | 33 + .../Eigen/CXX11/src/ThreadPool/ThreadYield.h | 20 + .../Eigen/CXX11/src/util/CXX11Meta.h | 542 +++++ .../Eigen/CXX11/src/util/CXX11Workarounds.h | 88 + .../Eigen/CXX11/src/util/EmulateArray.h | 267 +++ .../Eigen/CXX11/src/util/EmulateCXX11Meta.h | 311 +++ .../Eigen/CXX11/src/util/MaxSizeVector.h | 141 ++ eigenlib/unsupported/Eigen/EulerAngles | 43 + eigenlib/unsupported/Eigen/FFT | 419 ++++ 
eigenlib/unsupported/Eigen/IterativeSolvers | 42 + eigenlib/unsupported/Eigen/KroneckerProduct | 36 + eigenlib/unsupported/Eigen/LevenbergMarquardt | 45 + eigenlib/unsupported/Eigen/MPRealSupport | 209 ++ eigenlib/unsupported/Eigen/MatrixFunctions | 500 +++++ eigenlib/unsupported/Eigen/MoreVectorization | 24 + .../unsupported/Eigen/NonLinearOptimization | 134 ++ eigenlib/unsupported/Eigen/NumericalDiff | 56 + eigenlib/unsupported/Eigen/OpenGLSupport | 322 +++ eigenlib/unsupported/Eigen/Polynomials | 138 ++ eigenlib/unsupported/Eigen/Skyline | 39 + eigenlib/unsupported/Eigen/SparseExtra | 53 + eigenlib/unsupported/Eigen/SpecialFunctions | 63 + eigenlib/unsupported/Eigen/Splines | 31 + .../Eigen/src/AutoDiff/AutoDiffJacobian.h | 108 + .../Eigen/src/AutoDiff/AutoDiffScalar.h | 720 +++++++ .../Eigen/src/AutoDiff/AutoDiffVector.h | 220 +++ .../unsupported/Eigen/src/BVH/BVAlgorithms.h | 293 +++ eigenlib/unsupported/Eigen/src/BVH/KdBVH.h | 223 +++ .../ArpackSelfAdjointEigenSolver.h | 790 ++++++++ .../Eigen/src/EulerAngles/CMakeLists.txt | 6 + .../Eigen/src/EulerAngles/EulerAngles.h | 386 ++++ .../Eigen/src/EulerAngles/EulerSystem.h | 326 +++ .../unsupported/Eigen/src/FFT/ei_fftw_impl.h | 263 +++ .../Eigen/src/FFT/ei_kissfft_impl.h | 420 ++++ .../IterativeSolvers/ConstrainedConjGrad.h | 189 ++ .../Eigen/src/IterativeSolvers/DGMRES.h | 510 +++++ .../Eigen/src/IterativeSolvers/GMRES.h | 343 ++++ .../Eigen/src/IterativeSolvers/IncompleteLU.h | 90 + .../IterativeSolvers/IterationController.h | 154 ++ .../Eigen/src/IterativeSolvers/MINRES.h | 289 +++ .../Eigen/src/IterativeSolvers/Scaling.h | 187 ++ .../KroneckerProduct/KroneckerTensorProduct.h | 305 +++ .../LevenbergMarquardt/CopyrightMINPACK.txt | 52 + .../Eigen/src/LevenbergMarquardt/LMcovar.h | 84 + .../Eigen/src/LevenbergMarquardt/LMonestep.h | 202 ++ .../Eigen/src/LevenbergMarquardt/LMpar.h | 160 ++ .../Eigen/src/LevenbergMarquardt/LMqrsolv.h | 188 ++ .../LevenbergMarquardt/LevenbergMarquardt.h | 396 ++++ 
.../src/MatrixFunctions/MatrixExponential.h | 442 +++++ .../src/MatrixFunctions/MatrixFunction.h | 580 ++++++ .../src/MatrixFunctions/MatrixLogarithm.h | 373 ++++ .../Eigen/src/MatrixFunctions/MatrixPower.h | 709 +++++++ .../src/MatrixFunctions/MatrixSquareRoot.h | 368 ++++ .../Eigen/src/MatrixFunctions/StemFunction.h | 117 ++ .../src/MoreVectorization/MathFunctions.h | 95 + .../HybridNonLinearSolver.h | 601 ++++++ .../LevenbergMarquardt.h | 657 ++++++ .../Eigen/src/NonLinearOptimization/chkder.h | 66 + .../Eigen/src/NonLinearOptimization/covar.h | 70 + .../Eigen/src/NonLinearOptimization/dogleg.h | 107 + .../Eigen/src/NonLinearOptimization/fdjac1.h | 79 + .../Eigen/src/NonLinearOptimization/lmpar.h | 298 +++ .../Eigen/src/NonLinearOptimization/qrsolv.h | 91 + .../Eigen/src/NonLinearOptimization/r1mpyq.h | 30 + .../Eigen/src/NonLinearOptimization/r1updt.h | 99 + .../Eigen/src/NonLinearOptimization/rwupdt.h | 49 + .../Eigen/src/NumericalDiff/NumericalDiff.h | 130 ++ .../Eigen/src/Polynomials/Companion.h | 275 +++ .../Eigen/src/Polynomials/PolynomialSolver.h | 428 ++++ .../Eigen/src/Polynomials/PolynomialUtils.h | 143 ++ .../Eigen/src/Skyline/SkylineInplaceLU.h | 352 ++++ .../Eigen/src/Skyline/SkylineMatrix.h | 862 ++++++++ .../Eigen/src/Skyline/SkylineMatrixBase.h | 212 ++ .../Eigen/src/Skyline/SkylineProduct.h | 295 +++ .../Eigen/src/Skyline/SkylineStorage.h | 259 +++ .../Eigen/src/Skyline/SkylineUtil.h | 89 + .../SparseExtra/BlockOfDynamicSparseMatrix.h | 122 ++ .../Eigen/src/SparseExtra/BlockSparseMatrix.h | 1079 ++++++++++ .../src/SparseExtra/DynamicSparseMatrix.h | 404 ++++ .../Eigen/src/SparseExtra/MarketIO.h | 275 +++ .../src/SparseExtra/MatrixMarketIterator.h | 247 +++ .../Eigen/src/SparseExtra/RandomSetter.h | 327 +++ .../SpecialFunctionsArrayAPI.h | 124 ++ .../SpecialFunctionsFunctors.h | 236 +++ .../SpecialFunctions/SpecialFunctionsHalf.h | 47 + .../SpecialFunctions/SpecialFunctionsImpl.h | 1565 +++++++++++++++ .../SpecialFunctionsPacketMath.h | 58 + 
.../arch/CUDA/CudaSpecialFunctions.h | 165 ++ .../unsupported/Eigen/src/Splines/Spline.h | 507 +++++ .../Eigen/src/Splines/SplineFitting.h | 430 ++++ .../unsupported/Eigen/src/Splines/SplineFwd.h | 93 + eigenlib/unsupported/README.txt | 50 + 183 files changed, 53433 insertions(+), 2 deletions(-) create mode 100644 eigenlib/unsupported/Eigen/AdolcForward create mode 100644 eigenlib/unsupported/Eigen/AlignedVector3 create mode 100644 eigenlib/unsupported/Eigen/ArpackSupport create mode 100644 eigenlib/unsupported/Eigen/AutoDiff create mode 100644 eigenlib/unsupported/Eigen/BVH create mode 100644 eigenlib/unsupported/Eigen/CMakeLists.txt create mode 100644 eigenlib/unsupported/Eigen/CXX11/CMakeLists.txt create mode 100644 eigenlib/unsupported/Eigen/CXX11/Tensor create mode 100644 eigenlib/unsupported/Eigen/CXX11/TensorSymmetry create mode 100644 eigenlib/unsupported/Eigen/CXX11/ThreadPool create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/README.md create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/Tensor.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h create mode 100644 
eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h create mode 100644 
eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h create mode 100644 
eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/util/CXX11Meta.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/util/EmulateArray.h 
create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h create mode 100644 eigenlib/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h create mode 100644 eigenlib/unsupported/Eigen/EulerAngles create mode 100644 eigenlib/unsupported/Eigen/FFT create mode 100644 eigenlib/unsupported/Eigen/IterativeSolvers create mode 100644 eigenlib/unsupported/Eigen/KroneckerProduct create mode 100644 eigenlib/unsupported/Eigen/LevenbergMarquardt create mode 100644 eigenlib/unsupported/Eigen/MPRealSupport create mode 100644 eigenlib/unsupported/Eigen/MatrixFunctions create mode 100644 eigenlib/unsupported/Eigen/MoreVectorization create mode 100644 eigenlib/unsupported/Eigen/NonLinearOptimization create mode 100644 eigenlib/unsupported/Eigen/NumericalDiff create mode 100644 eigenlib/unsupported/Eigen/OpenGLSupport create mode 100644 eigenlib/unsupported/Eigen/Polynomials create mode 100644 eigenlib/unsupported/Eigen/Skyline create mode 100644 eigenlib/unsupported/Eigen/SparseExtra create mode 100644 eigenlib/unsupported/Eigen/SpecialFunctions create mode 100644 eigenlib/unsupported/Eigen/Splines create mode 100644 eigenlib/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h create mode 100755 eigenlib/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h create mode 100644 eigenlib/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h create mode 100644 eigenlib/unsupported/Eigen/src/BVH/BVAlgorithms.h create mode 100644 eigenlib/unsupported/Eigen/src/BVH/KdBVH.h create mode 100644 eigenlib/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h create mode 100644 eigenlib/unsupported/Eigen/src/EulerAngles/CMakeLists.txt create mode 100644 eigenlib/unsupported/Eigen/src/EulerAngles/EulerAngles.h create mode 100644 eigenlib/unsupported/Eigen/src/EulerAngles/EulerSystem.h create mode 100644 eigenlib/unsupported/Eigen/src/FFT/ei_fftw_impl.h create mode 100644 eigenlib/unsupported/Eigen/src/FFT/ei_kissfft_impl.h create mode 100644 
eigenlib/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h create mode 100644 eigenlib/unsupported/Eigen/src/IterativeSolvers/DGMRES.h create mode 100644 eigenlib/unsupported/Eigen/src/IterativeSolvers/GMRES.h create mode 100644 eigenlib/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h create mode 100644 eigenlib/unsupported/Eigen/src/IterativeSolvers/IterationController.h create mode 100644 eigenlib/unsupported/Eigen/src/IterativeSolvers/MINRES.h create mode 100644 eigenlib/unsupported/Eigen/src/IterativeSolvers/Scaling.h create mode 100644 eigenlib/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h create mode 100644 eigenlib/unsupported/Eigen/src/LevenbergMarquardt/CopyrightMINPACK.txt create mode 100644 eigenlib/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h create mode 100644 eigenlib/unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h create mode 100644 eigenlib/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h create mode 100644 eigenlib/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h create mode 100644 eigenlib/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h create mode 100644 eigenlib/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h create mode 100644 eigenlib/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h create mode 100644 eigenlib/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h create mode 100644 eigenlib/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h create mode 100644 eigenlib/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h create mode 100644 eigenlib/unsupported/Eigen/src/MatrixFunctions/StemFunction.h create mode 100644 eigenlib/unsupported/Eigen/src/MoreVectorization/MathFunctions.h create mode 100644 eigenlib/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h create mode 100644 eigenlib/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h create mode 100644 eigenlib/unsupported/Eigen/src/NonLinearOptimization/chkder.h create mode 
100644 eigenlib/unsupported/Eigen/src/NonLinearOptimization/covar.h create mode 100644 eigenlib/unsupported/Eigen/src/NonLinearOptimization/dogleg.h create mode 100644 eigenlib/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h create mode 100644 eigenlib/unsupported/Eigen/src/NonLinearOptimization/lmpar.h create mode 100644 eigenlib/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h create mode 100644 eigenlib/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h create mode 100644 eigenlib/unsupported/Eigen/src/NonLinearOptimization/r1updt.h create mode 100644 eigenlib/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h create mode 100644 eigenlib/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h create mode 100644 eigenlib/unsupported/Eigen/src/Polynomials/Companion.h create mode 100644 eigenlib/unsupported/Eigen/src/Polynomials/PolynomialSolver.h create mode 100644 eigenlib/unsupported/Eigen/src/Polynomials/PolynomialUtils.h create mode 100644 eigenlib/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h create mode 100644 eigenlib/unsupported/Eigen/src/Skyline/SkylineMatrix.h create mode 100644 eigenlib/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h create mode 100644 eigenlib/unsupported/Eigen/src/Skyline/SkylineProduct.h create mode 100644 eigenlib/unsupported/Eigen/src/Skyline/SkylineStorage.h create mode 100644 eigenlib/unsupported/Eigen/src/Skyline/SkylineUtil.h create mode 100644 eigenlib/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h create mode 100644 eigenlib/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h create mode 100644 eigenlib/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h create mode 100644 eigenlib/unsupported/Eigen/src/SparseExtra/MarketIO.h create mode 100644 eigenlib/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h create mode 100644 eigenlib/unsupported/Eigen/src/SparseExtra/RandomSetter.h create mode 100644 eigenlib/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h create mode 
100644 eigenlib/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h create mode 100644 eigenlib/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h create mode 100644 eigenlib/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h create mode 100644 eigenlib/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h create mode 100644 eigenlib/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h create mode 100644 eigenlib/unsupported/Eigen/src/Splines/Spline.h create mode 100644 eigenlib/unsupported/Eigen/src/Splines/SplineFitting.h create mode 100644 eigenlib/unsupported/Eigen/src/Splines/SplineFwd.h create mode 100644 eigenlib/unsupported/README.txt diff --git a/eigenlib/howto.txt b/eigenlib/howto.txt index d88199d5..5a7e5bef 100644 --- a/eigenlib/howto.txt +++ b/eigenlib/howto.txt @@ -8,8 +8,8 @@ Current Eigen Version 3.3.9 (04.12.2020) updated on 15/06/2021 To update the lib: - download Eigen - unzip it somewhere -- delete (in the filesystem) the content of the folder eigenlib/Eigen - copy the folders 'Eigen' there -- execute the two following shell commands in the folder Eigen +- delete (in the filesystem) the content of the folder eigenlib/Eigen - copy the folders 'Eigen' and 'unsupported' there +- execute the two following shell commands in the folder 'Eigen' and 'unsupported' grep -RiIl 'http://mozilla.org/MPL/2.0/.' * | xargs sed -i 's/http:\/\/mozilla.org\/MPL\/2.0\/./the mozilla.org home page/g' grep -RiIl 'http' * | xargs sed -i 's/http/xxxp/g' diff --git a/eigenlib/unsupported/Eigen/AdolcForward b/eigenlib/unsupported/Eigen/AdolcForward new file mode 100644 index 00000000..0390396f --- /dev/null +++ b/eigenlib/unsupported/Eigen/AdolcForward @@ -0,0 +1,156 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_ADLOC_FORWARD +#define EIGEN_ADLOC_FORWARD + +//-------------------------------------------------------------------------------- +// +// This file provides support for adolc's adouble type in forward mode. +// ADOL-C is a C++ automatic differentiation library, +// see xxxps://projects.coin-or.org/ADOL-C for more information. +// +// Note that the maximal number of directions is controlled by +// the preprocessor token NUMBER_DIRECTIONS. The default is 2. +// +//-------------------------------------------------------------------------------- + +#define ADOLC_TAPELESS +#ifndef NUMBER_DIRECTIONS +# define NUMBER_DIRECTIONS 2 +#endif +#include + +// adolc defines some very stupid macros: +#if defined(malloc) +# undef malloc +#endif + +#if defined(calloc) +# undef calloc +#endif + +#if defined(realloc) +# undef realloc +#endif + +#include + +namespace Eigen { + +/** + * \defgroup AdolcForward_Module Adolc forward module + * This module provides support for adolc's adouble type in forward mode. + * ADOL-C is a C++ automatic differentiation library, + * see xxxps://projects.coin-or.org/ADOL-C for more information. + * It mainly consists in: + * - a struct Eigen::NumTraits specialization + * - overloads of internal::* math function for adtl::adouble type. + * + * Note that the maximal number of directions is controlled by + * the preprocessor token NUMBER_DIRECTIONS. The default is 2. 
+ * + * \code + * #include + * \endcode + */ + //@{ + +} // namespace Eigen + +// Eigen's require a few additional functions which must be defined in the same namespace +// than the custom scalar type own namespace +namespace adtl { + +inline const adouble& conj(const adouble& x) { return x; } +inline const adouble& real(const adouble& x) { return x; } +inline adouble imag(const adouble&) { return 0.; } +inline adouble abs(const adouble& x) { return fabs(x); } +inline adouble abs2(const adouble& x) { return x*x; } + +} + +namespace Eigen { + +template<> struct NumTraits + : NumTraits +{ + typedef adtl::adouble Real; + typedef adtl::adouble NonInteger; + typedef adtl::adouble Nested; + enum { + IsComplex = 0, + IsInteger = 0, + IsSigned = 1, + RequireInitialization = 1, + ReadCost = 1, + AddCost = 1, + MulCost = 1 + }; +}; + +template class AdolcForwardJacobian : public Functor +{ + typedef adtl::adouble ActiveScalar; +public: + + AdolcForwardJacobian() : Functor() {} + AdolcForwardJacobian(const Functor& f) : Functor(f) {} + + // forward constructors + template + AdolcForwardJacobian(const T0& a0) : Functor(a0) {} + template + AdolcForwardJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {} + template + AdolcForwardJacobian(const T0& a0, const T1& a1, const T1& a2) : Functor(a0, a1, a2) {} + + typedef typename Functor::InputType InputType; + typedef typename Functor::ValueType ValueType; + typedef typename Functor::JacobianType JacobianType; + + typedef Matrix ActiveInput; + typedef Matrix ActiveValue; + + void operator() (const InputType& x, ValueType* v, JacobianType* _jac) const + { + eigen_assert(v!=0); + if (!_jac) + { + Functor::operator()(x, v); + return; + } + + JacobianType& jac = *_jac; + + ActiveInput ax = x.template cast(); + ActiveValue av(jac.rows()); + + for (int j=0; j +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_ALIGNED_VECTOR3 +#define EIGEN_ALIGNED_VECTOR3 + +#include + +namespace Eigen { + +/** + * \defgroup AlignedVector3_Module Aligned vector3 module + * + * \code + * #include + * \endcode + */ + //@{ + + +/** \class AlignedVector3 + * + * \brief A vectorization friendly 3D vector + * + * This class represents a 3D vector internally using a 4D vector + * such that vectorization can be seamlessly enabled. Of course, + * the same result can be achieved by directly using a 4D vector. + * This class makes this process simpler. + * + */ +// TODO specialize Cwise +template class AlignedVector3; + +namespace internal { +template struct traits > + : traits > +{ +}; +} + +template class AlignedVector3 + : public MatrixBase > +{ + typedef Matrix<_Scalar,4,1> CoeffType; + CoeffType m_coeffs; + public: + + typedef MatrixBase > Base; + EIGEN_DENSE_PUBLIC_INTERFACE(AlignedVector3) + using Base::operator*; + + inline Index rows() const { return 3; } + inline Index cols() const { return 1; } + + Scalar* data() { return m_coeffs.data(); } + const Scalar* data() const { return m_coeffs.data(); } + Index innerStride() const { return 1; } + Index outerStride() const { return 3; } + + inline const Scalar& coeff(Index row, Index col) const + { return m_coeffs.coeff(row, col); } + + inline Scalar& coeffRef(Index row, Index col) + { return m_coeffs.coeffRef(row, col); } + + inline const Scalar& coeff(Index index) const + { return m_coeffs.coeff(index); } + + inline Scalar& coeffRef(Index index) + { return m_coeffs.coeffRef(index);} + + + inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z) + : m_coeffs(x, y, z, Scalar(0)) + {} + + inline AlignedVector3(const AlignedVector3& other) + : Base(), m_coeffs(other.m_coeffs) + {} + + template + struct generic_assign_selector {}; + + template struct generic_assign_selector + { + inline static void 
run(AlignedVector3& dest, const XprType& src) + { + dest.m_coeffs = src; + } + }; + + template struct generic_assign_selector + { + inline static void run(AlignedVector3& dest, const XprType& src) + { + dest.m_coeffs.template head<3>() = src; + dest.m_coeffs.w() = Scalar(0); + } + }; + + template + inline AlignedVector3(const MatrixBase& other) + { + generic_assign_selector::run(*this,other.derived()); + } + + inline AlignedVector3& operator=(const AlignedVector3& other) + { m_coeffs = other.m_coeffs; return *this; } + + template + inline AlignedVector3& operator=(const MatrixBase& other) + { + generic_assign_selector::run(*this,other.derived()); + return *this; + } + + inline AlignedVector3 operator+(const AlignedVector3& other) const + { return AlignedVector3(m_coeffs + other.m_coeffs); } + + inline AlignedVector3& operator+=(const AlignedVector3& other) + { m_coeffs += other.m_coeffs; return *this; } + + inline AlignedVector3 operator-(const AlignedVector3& other) const + { return AlignedVector3(m_coeffs - other.m_coeffs); } + + inline AlignedVector3& operator-=(const AlignedVector3& other) /* subtracts in place; returns a reference to this object (not a copy), matching the other compound assignments */ + { m_coeffs -= other.m_coeffs; return *this; } + + inline AlignedVector3 operator*(const Scalar& s) const + { return AlignedVector3(m_coeffs * s); } + + inline friend AlignedVector3 operator*(const Scalar& s,const AlignedVector3& vec) + { return AlignedVector3(s * vec.m_coeffs); } + + inline AlignedVector3& operator*=(const Scalar& s) + { m_coeffs *= s; return *this; } + + inline AlignedVector3 operator/(const Scalar& s) const + { return AlignedVector3(m_coeffs / s); } + + inline AlignedVector3& operator/=(const Scalar& s) + { m_coeffs /= s; return *this; } + + inline Scalar dot(const AlignedVector3& other) const + { + eigen_assert(m_coeffs.w()==Scalar(0)); + eigen_assert(other.m_coeffs.w()==Scalar(0)); + return m_coeffs.dot(other.m_coeffs); + } + + inline void normalize() + { + m_coeffs /= norm(); + } + + inline AlignedVector3 normalized() const + { + return
AlignedVector3(m_coeffs / norm()); + } + + inline Scalar sum() const + { + eigen_assert(m_coeffs.w()==Scalar(0)); + return m_coeffs.sum(); + } + + inline Scalar squaredNorm() const + { + eigen_assert(m_coeffs.w()==Scalar(0)); + return m_coeffs.squaredNorm(); + } + + inline Scalar norm() const + { + using std::sqrt; + return sqrt(squaredNorm()); + } + + inline AlignedVector3 cross(const AlignedVector3& other) const + { + return AlignedVector3(m_coeffs.cross3(other.m_coeffs)); + } + + template + inline bool isApprox(const MatrixBase& other, const RealScalar& eps=NumTraits::dummy_precision()) const + { + return m_coeffs.template head<3>().isApprox(other,eps); + } + + CoeffType& coeffs() { return m_coeffs; } + const CoeffType& coeffs() const { return m_coeffs; } +}; + +namespace internal { + +template +struct eval, Dense> +{ + typedef const AlignedVector3<_Scalar>& type; +}; + +template +struct evaluator > + : evaluator > +{ + typedef AlignedVector3 XprType; + typedef evaluator > Base; + + evaluator(const XprType &m) : Base(m.coeffs()) {} +}; + +} + +//@} + +} + +#endif // EIGEN_ALIGNED_VECTOR3 diff --git a/eigenlib/unsupported/Eigen/ArpackSupport b/eigenlib/unsupported/Eigen/ArpackSupport new file mode 100644 index 00000000..3cc9ff5b --- /dev/null +++ b/eigenlib/unsupported/Eigen/ArpackSupport @@ -0,0 +1,31 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_ARPACKSUPPORT_MODULE_H +#define EIGEN_ARPACKSUPPORT_MODULE_H + +#include + +/** \defgroup ArpackSupport_Module Arpack support module + * + * This module provides a wrapper to Arpack, a library for sparse eigenvalue decomposition. 
+ * + * \code + * #include + * \endcode + */ + +#include + +#include +#include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h" + +#include + +#endif // EIGEN_ARPACKSUPPORT_MODULE_H +/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/eigenlib/unsupported/Eigen/AutoDiff b/eigenlib/unsupported/Eigen/AutoDiff new file mode 100644 index 00000000..84227510 --- /dev/null +++ b/eigenlib/unsupported/Eigen/AutoDiff @@ -0,0 +1,40 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_AUTODIFF_MODULE +#define EIGEN_AUTODIFF_MODULE + +namespace Eigen { + +/** + * \defgroup AutoDiff_Module Auto Diff module + * + * This module features forward automatic differentiation via a simple + * templated scalar type wrapper AutoDiffScalar. + * + * Warning : this should NOT be confused with numerical differentiation, which + * is a different method and has its own module in Eigen : \ref NumericalDiff_Module. + * + * \code + * #include + * \endcode + */ +//@{ + +} + +#include "src/AutoDiff/AutoDiffScalar.h" +// #include "src/AutoDiff/AutoDiffVector.h" +#include "src/AutoDiff/AutoDiffJacobian.h" + +namespace Eigen { +//@} +} + +#endif // EIGEN_AUTODIFF_MODULE diff --git a/eigenlib/unsupported/Eigen/BVH b/eigenlib/unsupported/Eigen/BVH new file mode 100644 index 00000000..278fe645 --- /dev/null +++ b/eigenlib/unsupported/Eigen/BVH @@ -0,0 +1,95 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Ilya Baran +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0.
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_BVH_MODULE_H +#define EIGEN_BVH_MODULE_H + +#include +#include +#include +#include +#include + +namespace Eigen { + +/** + * \defgroup BVH_Module BVH module + * \brief This module provides generic bounding volume hierarchy algorithms + * and reference tree implementations. + * + * + * \code + * #include + * \endcode + * + * A bounding volume hierarchy (BVH) can accelerate many geometric queries. This module provides a generic implementation + * of the two basic algorithms over a BVH: intersection of a query object against all objects in the hierarchy and minimization + * of a function over the objects in the hierarchy. It also provides intersection and minimization over a cartesian product of + * two BVH's. A BVH accelerates intersection by using the fact that if a query object does not intersect a volume, then it cannot + * intersect any object contained in that volume. Similarly, a BVH accelerates minimization because the minimum of a function + * over a volume is no greater than the minimum of a function over any object contained in it. 
+ * + * Some sample queries that can be written in terms of intersection are: + * - Determine all points where a ray intersects a triangle mesh + * - Given a set of points, determine which are contained in a query sphere + * - Given a set of spheres, determine which contain the query point + * - Given a set of disks, determine if any is completely contained in a query rectangle (represent each 2D disk as a point \f$(x,y,r)\f$ + * in 3D and represent the rectangle as a pyramid based on the original rectangle and shrinking in the \f$r\f$ direction) + * - Given a set of points, count how many pairs are \f$d\pm\epsilon\f$ apart (done by looking at the cartesian product of the set + * of points with itself) + * + * Some sample queries that can be written in terms of function minimization over a set of objects are: + * - Find the intersection between a ray and a triangle mesh closest to the ray origin (function is infinite off the ray) + * - Given a polyline and a query point, determine the closest point on the polyline to the query + * - Find the diameter of a point cloud (done by looking at the cartesian product and using negative distance as the function) + * - Determine how far two meshes are from colliding (this is also a cartesian product query) + * + * This implementation decouples the basic algorithms both from the type of hierarchy (and the types of the bounding volumes) and + * from the particulars of the query. To enable abstraction from the BVH, the BVH is required to implement a generic mechanism + * for traversal. To abstract from the query, the query is responsible for keeping track of results. 
+ * + * To be used in the algorithms, a hierarchy must implement the following traversal mechanism (see KdBVH for a sample implementation): \code + typedef Volume //the type of bounding volume + typedef Object //the type of object in the hierarchy + typedef Index //a reference to a node in the hierarchy--typically an int or a pointer + typedef VolumeIterator //an iterator type over node children--returns Index + typedef ObjectIterator //an iterator over object (leaf) children--returns const Object & + Index getRootIndex() const //returns the index of the hierarchy root + const Volume &getVolume(Index index) const //returns the bounding volume of the node at given index + void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd, + ObjectIterator &outOBegin, ObjectIterator &outOEnd) const + //getChildren takes a node index and makes [outVBegin, outVEnd) range over its node children + //and [outOBegin, outOEnd) range over its object children + \endcode + * + * To use the hierarchy, call BVIntersect or BVMinimize, passing it a BVH (or two, for cartesian product) and a minimizer or intersector. + * For an intersection query on a single BVH, the intersector encapsulates the query and must provide two functions: + * \code + bool intersectVolume(const Volume &volume) //returns true if the query intersects the volume + bool intersectObject(const Object &object) //returns true if the intersection search should terminate immediately + \endcode + * The guarantee that BVIntersect provides is that intersectObject will be called on every object whose bounding volume + * intersects the query (but possibly on other objects too) unless the search is terminated prematurely. It is the + * responsibility of the intersectObject function to keep track of the results in whatever manner is appropriate. + * The cartesian product intersection and the BVMinimize queries are similar--see their individual documentation. 
+ * + * The following is a simple but complete example for how to use the BVH to accelerate the search for a closest red-blue point pair: + * \include BVH_Example.cpp + * Output: \verbinclude BVH_Example.out + */ +} + +//@{ + +#include "src/BVH/BVAlgorithms.h" +#include "src/BVH/KdBVH.h" + +//@} + +#endif // EIGEN_BVH_MODULE_H diff --git a/eigenlib/unsupported/Eigen/CMakeLists.txt b/eigenlib/unsupported/Eigen/CMakeLists.txt new file mode 100644 index 00000000..631a0601 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CMakeLists.txt @@ -0,0 +1,32 @@ +set(Eigen_HEADERS + AdolcForward + AlignedVector3 + ArpackSupport + AutoDiff + BVH + EulerAngles + FFT + IterativeSolvers + KroneckerProduct + LevenbergMarquardt + MatrixFunctions + MoreVectorization + MPRealSupport + NonLinearOptimization + NumericalDiff + OpenGLSupport + Polynomials + Skyline + SparseExtra + SpecialFunctions + Splines + ) + +install(FILES + ${Eigen_HEADERS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel + ) + +install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h") + +add_subdirectory(CXX11) diff --git a/eigenlib/unsupported/Eigen/CXX11/CMakeLists.txt b/eigenlib/unsupported/Eigen/CXX11/CMakeLists.txt new file mode 100644 index 00000000..385ed240 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/CMakeLists.txt @@ -0,0 +1,8 @@ +set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool) + +install(FILES + ${Eigen_CXX11_HEADERS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel + ) + +install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h") diff --git a/eigenlib/unsupported/Eigen/CXX11/Tensor b/eigenlib/unsupported/Eigen/CXX11/Tensor new file mode 100644 index 00000000..5d88bb5e --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/Tensor @@ -0,0 +1,154 @@ +// This file is part of Eigen, a lightweight C++ template library 
+// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +//#ifndef EIGEN_CXX11_TENSOR_MODULE +//#define EIGEN_CXX11_TENSOR_MODULE + +#include "../../../Eigen/Core" + +#ifdef EIGEN_USE_SYCL +#undef min +#undef max +#undef isnan +#undef isinf +#undef isfinite +#include +#include +#include +#include +#endif + +#include + +#include "../SpecialFunctions" +#include "src/util/CXX11Meta.h" +#include "src/util/MaxSizeVector.h" + +/** \defgroup CXX11_Tensor_Module Tensor Module + * + * This module provides a Tensor class for storing arbitrarily indexed + * objects. + * + * \code + * #include + * \endcode + * + * Much of the documentation can be found \ref eigen_tensors "here". + */ + +#include +#include +#include + +#ifdef _WIN32 +typedef __int16 int16_t; +typedef unsigned __int16 uint16_t; +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#else +#include +#endif + +#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 +#include +#endif + +#ifdef _WIN32 +#include +#elif defined(__APPLE__) +#include +#else +#include +#endif + +#ifdef EIGEN_USE_THREADS +#include "ThreadPool" +#endif + +#ifdef EIGEN_USE_GPU +#include +#include +#if __cplusplus >= 201103L +#include +#include +#endif +#endif + +#include "src/Tensor/TensorMacros.h" +#include "src/Tensor/TensorForwardDeclarations.h" +#include "src/Tensor/TensorMeta.h" +#include "src/Tensor/TensorFunctors.h" +#include "src/Tensor/TensorCostModel.h" +#include "src/Tensor/TensorDeviceDefault.h" +#include "src/Tensor/TensorDeviceThreadPool.h" +#include "src/Tensor/TensorDeviceCuda.h" +#include "src/Tensor/TensorDeviceSycl.h" +#include "src/Tensor/TensorIndexList.h" +#include "src/Tensor/TensorDimensionList.h" 
+#include "src/Tensor/TensorDimensions.h" +#include "src/Tensor/TensorInitializer.h" +#include "src/Tensor/TensorTraits.h" +#include "src/Tensor/TensorRandom.h" +#include "src/Tensor/TensorUInt128.h" +#include "src/Tensor/TensorIntDiv.h" +#include "src/Tensor/TensorGlobalFunctions.h" + +#include "src/Tensor/TensorBase.h" + +#include "src/Tensor/TensorEvaluator.h" +#include "src/Tensor/TensorExpr.h" +#include "src/Tensor/TensorReduction.h" +#include "src/Tensor/TensorReductionCuda.h" +#include "src/Tensor/TensorArgMax.h" +#include "src/Tensor/TensorConcatenation.h" +#include "src/Tensor/TensorContractionMapper.h" +#include "src/Tensor/TensorContractionBlocking.h" +#include "src/Tensor/TensorContraction.h" +#include "src/Tensor/TensorContractionThreadPool.h" +#include "src/Tensor/TensorContractionCuda.h" +#include "src/Tensor/TensorConversion.h" +#include "src/Tensor/TensorConvolution.h" +#include "src/Tensor/TensorFFT.h" +#include "src/Tensor/TensorPatch.h" +#include "src/Tensor/TensorImagePatch.h" +#include "src/Tensor/TensorVolumePatch.h" +#include "src/Tensor/TensorBroadcasting.h" +#include "src/Tensor/TensorChipping.h" +#include "src/Tensor/TensorInflation.h" +#include "src/Tensor/TensorLayoutSwap.h" +#include "src/Tensor/TensorMorphing.h" +#include "src/Tensor/TensorPadding.h" +#include "src/Tensor/TensorReverse.h" +#include "src/Tensor/TensorShuffling.h" +#include "src/Tensor/TensorStriding.h" +#include "src/Tensor/TensorCustomOp.h" +#include "src/Tensor/TensorEvalTo.h" +#include "src/Tensor/TensorForcedEval.h" +#include "src/Tensor/TensorGenerator.h" +#include "src/Tensor/TensorAssign.h" +#include "src/Tensor/TensorScan.h" + +#include "src/Tensor/TensorSycl.h" +#include "src/Tensor/TensorExecutor.h" +#include "src/Tensor/TensorDevice.h" + +#include "src/Tensor/TensorStorage.h" +#include "src/Tensor/Tensor.h" +#include "src/Tensor/TensorFixedSize.h" +#include "src/Tensor/TensorMap.h" +#include "src/Tensor/TensorRef.h" + +#include "src/Tensor/TensorIO.h" + 
+#include + +//#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/eigenlib/unsupported/Eigen/CXX11/TensorSymmetry b/eigenlib/unsupported/Eigen/CXX11/TensorSymmetry new file mode 100644 index 00000000..d39167cf --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/TensorSymmetry @@ -0,0 +1,42 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE +#define EIGEN_CXX11_TENSORSYMMETRY_MODULE + +#include + +#include + +#include "src/util/CXX11Meta.h" + +/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module + * + * This module provides classes that allow for the definition of + * symmetries w.r.t. tensor indices. + * + * Including this module will implicitly include the Tensor module. + * + * \code + * #include + * \endcode + */ + +#include "src/TensorSymmetry/util/TemplateGroupTheory.h" +#include "src/TensorSymmetry/Symmetry.h" +#include "src/TensorSymmetry/StaticSymmetry.h" +#include "src/TensorSymmetry/DynamicSymmetry.h" + +#include + +#endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/eigenlib/unsupported/Eigen/CXX11/ThreadPool b/eigenlib/unsupported/Eigen/CXX11/ThreadPool new file mode 100644 index 00000000..1455b19d --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/ThreadPool @@ -0,0 +1,65 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0.
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_THREADPOOL_MODULE +#define EIGEN_CXX11_THREADPOOL_MODULE + +#include "../../../Eigen/Core" + +#include + +/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module + * + * This module provides 2 threadpool implementations + * - a simple reference implementation + * - a faster non blocking implementation + * + * This module requires C++11. + * + * \code + * #include + * \endcode + */ + + +// The code depends on CXX11, so only include the module if the +// compiler supports it. +#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "src/util/CXX11Meta.h" +#include "src/util/MaxSizeVector.h" + +#include "src/ThreadPool/ThreadLocal.h" +#include "src/ThreadPool/ThreadYield.h" +#include "src/ThreadPool/EventCount.h" +#include "src/ThreadPool/RunQueue.h" +#include "src/ThreadPool/ThreadPoolInterface.h" +#include "src/ThreadPool/ThreadEnvironment.h" +#include "src/ThreadPool/SimpleThreadPool.h" +#include "src/ThreadPool/NonBlockingThreadPool.h" + +#endif + +#include + +#endif // EIGEN_CXX11_THREADPOOL_MODULE + diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/README.md b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/README.md new file mode 100644 index 00000000..da70fa21 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -0,0 +1,1760 @@ +# Eigen Tensors {#eigen_tensors} + +Tensors are multidimensional arrays of elements. Elements are typically scalars, +but more complex types such as strings are also supported. + +[TOC] + +## Tensor Classes + +You can manipulate a tensor with one of the following classes. They all are in +the namespace `::Eigen.` + + +### Class Tensor + +This is the class to use to create a tensor and allocate memory for it. 
The +class is templatized with the tensor datatype, such as float or int, and the +tensor rank. The rank is the number of dimensions, for example rank 2 is a +matrix. + +Tensors of this class are resizable. For example, if you assign a tensor of a +different size to a Tensor, that tensor is resized to match its new value. + +#### Constructor `Tensor(size0, size1, ...)` + +Constructor for a Tensor. The constructor must be passed `rank` integers +indicating the sizes of the instance along each of the `rank` +dimensions. + + // Create a tensor of rank 3 of sizes 2, 3, 4. This tensor owns + // memory to hold 24 floating point values (24 = 2 x 3 x 4). + Tensor t_3d(2, 3, 4); + + // Resize t_3d by assigning a tensor of different sizes, but same rank. + t_3d = Tensor(3, 4, 3); + +#### Constructor `Tensor(size_array)` + +Constructor where the sizes for the constructor are specified as an array of +values instead of an explicit list of parameters. The array type to use is +`Eigen::array`. The array can be constructed automatically +from an initializer list. + + // Create a tensor of strings of rank 2 with sizes 5, 7. + Tensor t_2d({5, 7}); + + +### Class `TensorFixedSize>` + +Class to use for tensors of fixed size, where the size is known at compile +time. Fixed sized tensors can provide very fast computations because all their +dimensions are known by the compiler. FixedSize tensors are not resizable. + +If the total number of elements in a fixed size tensor is small enough the +tensor data is held onto the stack and does not cause heap allocation and free. + + // Create a 4 x 3 tensor of floats. + TensorFixedSize> t_4x3; + +### Class `TensorMap>` + +This is the class to use to create a tensor on top of memory allocated and +owned by another part of your code. It allows to view any piece of allocated +memory as a Tensor. Instances of this class do not own the memory where the +data are stored.
+ +A TensorMap is not resizable because it does not own the memory where its data +are stored. + +#### Constructor `TensorMap>(data, size0, size1, ...)` + +Constructor for a Tensor. The constructor must be passed a pointer to the +storage for the data, and "rank" size attributes. The storage has to be +large enough to hold all the data. + + // Map a tensor of ints on top of stack-allocated storage. + int storage[128]; // 2 x 4 x 2 x 8 = 128 + TensorMap> t_4d(storage, 2, 4, 2, 8); + + // The same storage can be viewed as a different tensor. + // You can also pass the sizes as an array. + TensorMap> t_2d(storage, 16, 8); + + // You can also map fixed-size tensors. Here we get a 1d view of + // the 2d fixed-size tensor. + TensorFixedSize> t_4x3; + TensorMap> t_12(t_4x3.data(), 12); + + +#### Class `TensorRef` + +See Assigning to a TensorRef below. + +## Accessing Tensor Elements + +#### ` tensor(index0, index1...)` + +Return the element at position `(index0, index1...)` in tensor +`tensor`. You must pass as many parameters as the rank of `tensor`. +The expression can be used as an l-value to set the value of the element at the +specified position. The value returned is of the datatype of the tensor. + + // Set the value of the element at position (0, 1, 0); + Tensor t_3d(2, 3, 4); + t_3d(0, 1, 0) = 12.0f; + + // Initialize all elements to random values. + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 4; ++k) { + t_3d(i, j, k) = ...some random value...; + } + } + } + + // Print elements of a tensor. + for (int i = 0; i < 2; ++i) { + LOG(INFO) << t_3d(i, 0, 0); + } + + +## TensorLayout + +The tensor library supports 2 layouts: `ColMajor` (the default) and +`RowMajor`. Only the default column major layout is currently fully +supported, and it is therefore not recommended to attempt to use the row major +layout at the moment. + +The layout of a tensor is optionally specified as part of its type. 
If not +specified explicitly column major is assumed. + + Tensor col_major; // equivalent to Tensor + TensorMap > row_major(data, ...); + +All the arguments to an expression must use the same layout. Attempting to mix +different layouts will result in a compilation error. + +It is possible to change the layout of a tensor or an expression using the +`swap_layout()` method. Note that this will also reverse the order of the +dimensions. + + Tensor col_major(2, 4); + Tensor row_major(2, 4); + + Tensor col_major_result = col_major; // ok, layouts match + Tensor col_major_result = row_major; // will not compile + + // Simple layout swap + col_major_result = row_major.swap_layout(); + eigen_assert(col_major_result.dimension(0) == 4); + eigen_assert(col_major_result.dimension(1) == 2); + + // Swap the layout and preserve the order of the dimensions + array shuffle(1, 0); + col_major_result = row_major.swap_layout().shuffle(shuffle); + eigen_assert(col_major_result.dimension(0) == 2); + eigen_assert(col_major_result.dimension(1) == 4); + + +## Tensor Operations + +The Eigen Tensor library provides a vast library of operations on Tensors: +numerical operations such as addition and multiplication, geometry operations +such as slicing and shuffling, etc. These operations are available as methods +of the Tensor classes, and in some cases as operator overloads. For example +the following code computes the elementwise addition of two tensors: + + Tensor t1(2, 3, 4); + ...set some values in t1... + Tensor t2(2, 3, 4); + ...set some values in t2... + // Set t3 to the element wise sum of t1 and t2 + Tensor t3 = t1 + t2; + +While the code above looks easy enough, it is important to understand that the +expression `t1 + t2` is not actually adding the values of the tensors. The +expression instead constructs a "tensor operator" object of the class +TensorCwiseBinaryOp, which has references to the tensors +`t1` and `t2`. This is a small C++ object that knows how to add +`t1` and `t2`. 
It is only when the value of the expression is assigned +to the tensor `t3` that the addition is actually performed. Technically, +this happens through the overloading of `operator=()` in the Tensor class. + +This mechanism for computing tensor expressions allows for lazy evaluation and +optimizations which are what make the tensor library very fast. + +Of course, the tensor operators do nest, and the expression `t1 + t2 * 0.3f` +is actually represented with the (approximate) tree of operators: + + TensorCwiseBinaryOp(t1, TensorCwiseUnaryOp(t2, 0.3f)) + + +### Tensor Operations and C++ "auto" + +Because Tensor operations create tensor operators, the C++ `auto` keyword +does not have its intuitive meaning. Consider these 2 lines of code: + + Tensor t3 = t1 + t2; + auto t4 = t1 + t2; + +In the first line we allocate the tensor `t3` and it will contain the +result of the addition of `t1` and `t2`. In the second line, `t4` +is actually the tree of tensor operators that will compute the addition of +`t1` and `t2`. In fact, `t4` is *not* a tensor and you cannot get +the values of its elements: + + Tensor t3 = t1 + t2; + cout << t3(0, 0, 0); // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0) + + auto t4 = t1 + t2; + cout << t4(0, 0, 0); // Compilation error! + +When you use `auto` you do not get a Tensor as a result but instead a +non-evaluated expression. So only use `auto` to delay evaluation. + +Unfortunately, there is no single underlying concrete type for holding +non-evaluated expressions, hence you have to use auto in the case when you do +want to hold non-evaluated expressions. + +When you need the results of a set of tensor computations you have to assign the +result to a Tensor that will be capable of holding onto them. This can be +either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing +piece of memory.
All the following will work: + + auto t4 = t1 + t2; + + Tensor result = t4; // Could also be: result(t4); + cout << result(0, 0, 0); + + TensorMap result(, , ...) = t4; + cout << result(0, 0, 0); + + TensorFixedSize> result = t4; + cout << result(0, 0, 0); + +Until you need the results, you can keep the operation around, and even reuse +it for additional operations. As long as you keep the expression as an +operation, no computation is performed. + + // One way to compute exp((t1 + t2) * 0.2f); + auto t3 = t1 + t2; + auto t4 = t3 * 0.2f; + auto t5 = t4.exp(); + Tensor result = t5; + + // Another way, exactly as efficient as the previous one: + Tensor result = ((t1 + t2) * 0.2f).exp(); + +### Controlling When Expression are Evaluated + +There are several ways to control when expressions are evaluated: + +* Assignment to a Tensor, TensorFixedSize, or TensorMap. +* Use of the eval() method. +* Assignment to a TensorRef. + +#### Assigning to a Tensor, TensorFixedSize, or TensorMap. + +The most common way to evaluate an expression is to assign it to a Tensor. In +the example below, the `auto` declarations make the intermediate values +"Operations", not Tensors, and do not cause the expressions to be evaluated. +The assignment to the Tensor `result` causes the evaluation of all the +operations. + + auto t3 = t1 + t2; // t3 is an Operation. + auto t4 = t3 * 0.2f; // t4 is an Operation. + auto t5 = t4.exp(); // t5 is an Operation. + Tensor result = t5; // The operations are evaluated. + +If you know the ranks and sizes of the Operation value you can assign the +Operation to a TensorFixedSize instead of a Tensor, which is a bit more +efficient. + + // We know that the result is a 4x4x2 tensor! + TensorFixedSize> result = t5; + +Similarly, assigning an expression to a TensorMap causes its evaluation. Like +tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to +have the rank and sizes of the expression that is assigned to them.
+ +#### Calling `eval()`. + +When you compute large composite expressions, you sometimes want to tell Eigen +that an intermediate value in the expression tree is worth evaluating ahead of +time. This is done by inserting a call to the `eval()` method of the +expression Operation. + + // The previous example could have been written: + Tensor result = ((t1 + t2) * 0.2f).exp(); + + // If you want to compute (t1 + t2) once ahead of time you can write: + Tensor result = ((t1 + t2).eval() * 0.2f).exp(); + +Semantically, calling `eval()` is equivalent to materializing the value of +the expression in a temporary Tensor of the right size. The code above in +effect does: + + // .eval() knows the size! + TensorFixedSize> tmp = t1 + t2; + Tensor result = (tmp * 0.2f).exp(); + +Note that the return value of `eval()` is itself an Operation, so the +following code does not do what you may think: + + // Here t3 is an evaluation Operation. t3 has not been evaluated yet. + auto t3 = (t1 + t2).eval(); + + // You can use t3 in another expression. Still no evaluation. + auto t4 = (t3 * 0.2f).exp(); + + // The value is evaluated when you assign the Operation to a Tensor, using + // an intermediate tensor to represent t3. + Tensor result = t4; + +While in the examples above calling `eval()` does not make a difference in +performance, in other cases it can make a huge difference. In the expression +below the `broadcast()` expression causes the `X.maximum()` expression +to be evaluated many times: + + Tensor<...> X ...; + Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast)) + * beta).exp(); + +Inserting a call to `eval()` between the `maximum()` and +`reshape()` calls guarantees that maximum() is only computed once and +greatly speeds up execution: + + Tensor<...> Y = + ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) + * beta).exp(); + +In the other example below, the tensor `Y` is both used in the expression +and its assignment.
This is an aliasing problem and if the evaluation is not +done in the right order Y will be updated incrementally during the evaluation +resulting in bogus results: + + Tensor<...> Y ...; + Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast)); + +Inserting a call to `eval()` between the `sum()` and `reshape()` +expressions ensures that the sum is computed before any updates to `Y` are +done. + + Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + +Note that an eval around the full right hand side expression is not needed +because the generated code has to compute the i-th value of the right hand side +before assigning it to the left hand side. + +However, if you were assigning the expression value to a shuffle of `Y` +then you would need to force an eval for correctness by adding an `eval()` +call for the right hand side: + + Y.shuffle(...) = + (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval(); + + +#### Assigning to a `TensorRef`. + +If you need to access only a few elements from the value of an expression you +can avoid materializing the value in a full tensor by using a TensorRef. + +A TensorRef is a small wrapper class for any Eigen Operation. It provides +overloads for the `()` operator that let you access individual values in +the expression. TensorRef is convenient, because the Operations themselves do +not provide a way to access individual elements. + + // Create a TensorRef for the expression. The expression is not + // evaluated yet. + TensorRef > ref = ((t1 + t2) * 0.2f).exp(); + + // Use "ref" to access individual elements. The expression is evaluated + // on the fly. + float at_0 = ref(0, 0, 0); + cout << ref(0, 1, 0); + +Only use TensorRef when you need a subset of the values of the expression. +TensorRef only computes the values you access. However note that if you are +going to access all the values it will be much faster to materialize the +results in a Tensor first.
+ +In some cases, if the full Tensor result would be very large, you may save +memory by accessing it as a TensorRef. But not always. So don't count on it. + + +### Controlling How Expressions Are Evaluated + +The tensor library provides several implementations of the various operations +such as contractions and convolutions. The implementations are optimized for +different environments: single threaded on CPU, multi threaded on CPU, or on a +GPU using cuda. Additional implementations may be added later. + +You can choose which implementation to use with the `device()` call. If +you do not choose an implementation explicitly the default implementation that +uses a single thread on the CPU is used. + +The default implementation has been optimized for recent Intel CPUs, taking +advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the +library on ARM CPUs. Note that you need to pass compiler-dependent flags +to enable the use of SSE, AVX, and other instructions. + +For example, the following code adds two tensors using the default +single-threaded CPU implementation: + + Tensor a(30, 40); + Tensor b(30, 40); + Tensor c = a + b; + +To choose a different implementation you have to insert a `device()` call +before the assignment of the result. For technical C++ reasons this requires +that the Tensor for the result be declared on its own. This means that you +have to know the size of the result. + + Eigen::Tensor c(30, 40); + c.device(...) = a + b; + +The call to `device()` must be the last call on the left of the operator=. + +You must pass to the `device()` call an Eigen device object. There are +presently three devices you can use: DefaultDevice, ThreadPoolDevice and +GpuDevice. + + +#### Evaluating With the DefaultDevice + +This is exactly the same as not inserting a `device()` call. + + DefaultDevice my_device; + c.device(my_device) = a + b; + +#### Evaluating with a Thread Pool + + // Create the Eigen ThreadPoolDevice. 
+ Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */); + + // Now just use the device when evaluating expressions. + Eigen::Tensor c(30, 50); + c.device(my_device) = a.contract(b, dot_product_dims); + + +#### Evaluating On GPU + +This is presently a bit more complicated than just using a thread pool device. +You need to create a GPU device but you also need to explicitly allocate the +memory for tensors with cuda. + + +## API Reference + +### Datatypes + +In the documentation of the tensor methods and Operation we mention datatypes +that are tensor-type specific: + +#### `::``Dimensions` + +Acts like an array of ints. Has an `int size` attribute, and can be +indexed like an array to access individual values. Used to represent the +dimensions of a tensor. See `dimensions()`. + +#### `::``Index` + +Acts like an `int`. Used for indexing tensors along their dimensions. See +`operator()`, `dimension()`, and `size()`. + +#### `::``Scalar` + +Represents the datatype of individual tensor elements. For example, for a +`Tensor`, `Scalar` is the type `float`. See +`setConstant()`. + +#### `` + +We use this pseudo type to indicate that a tensor Operation is returned by a +method. We indicate in the text the type and dimensions of the tensor that the +Operation returns after evaluation. + +The Operation will have to be evaluated, for example by assigning it to a +tensor, before you can access the values of the resulting tensor. You can also +access the values through a TensorRef. + + +## Built-in Tensor Methods + +These are usual C++ methods that act on tensors immediately. They are not +Operations which provide delayed evaluation of their results. Unless specified +otherwise, all the methods listed below are available on all tensor classes: +Tensor, TensorFixedSize, and TensorMap. + +## Metadata + +### `int NumDimensions` + +Constant value indicating the number of dimensions of a Tensor. This is also +known as the tensor "rank". 
+ + Eigen::Tensor a(3, 4); + cout << "Dims " << a.NumDimensions; + => Dims 2 + +### `Dimensions dimensions()` + +Returns an array-like object representing the dimensions of the tensor. +The actual type of the `dimensions()` result is `::``Dimensions`. + + Eigen::Tensor a(3, 4); + const Eigen::Tensor::Dimensions& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +If you use a C++11 compiler, you can use `auto` to simplify the code: + + const auto& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +### `Index dimension(Index n)` + +Returns the n-th dimension of the tensor. The actual type of the +`dimension()` result is `::``Index`, but you can +always use it like an int. + + Eigen::Tensor a(3, 4); + int dim1 = a.dimension(1); + cout << "Dim 1: " << dim1; + => Dim 1: 4 + +### `Index size()` + +Returns the total number of elements in the tensor. This is the product of all +the tensor dimensions. The actual type of the `size()` result is +`::``Index`, but you can always use it like an int. + + Eigen::Tensor a(3, 4); + cout << "Size: " << a.size(); + => Size: 12 + + +### Getting Dimensions From An Operation + +A few operations provide `dimensions()` directly, +e.g. `TensorReslicingOp`. Most operations defer calculating dimensions +until the operation is being evaluated. If you need access to the dimensions +of a deferred operation, you can wrap it in a TensorRef (see Assigning to a +TensorRef above), which provides `dimensions()` and `dimension()` as +above. + +TensorRef can also wrap the plain Tensor types, so this is a useful idiom in +templated contexts where the underlying object could be either a raw Tensor +or some deferred operation (e.g. a slice of a Tensor). 
In this case, the +template code can wrap the object in a TensorRef and reason about its +dimensionality while remaining agnostic to the underlying type. + + +## Constructors + +### Tensor + +Creates a tensor of the specified size. The number of arguments must be equal +to the rank of the tensor. The content of the tensor is not initialized. + + Eigen::Tensor a(3, 4); + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + +### TensorFixedSize + +Creates a tensor of the specified size. The number of arguments in the Sizes<> +template parameter determines the rank of the tensor. The content of the tensor +is not initialized. + + Eigen::TensorFixedSize> a; + cout << "Rank: " << a.rank() << endl; + => Rank: 2 + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + +### TensorMap + +Creates a tensor mapping an existing array of data. The data must not be freed +until the TensorMap is discarded, and the size of the data must be large enough +to accommodate the coefficients of the tensor. + + float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + Eigen::TensorMap> a(data, 3, 4); + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + cout << "a(1, 2): " << a(1, 2) << endl; + => a(1, 2): 7 + + +## Contents Initialization + +When a new Tensor or a new TensorFixedSize is created, memory is allocated to +hold all the tensor elements, but the memory is not initialized. Similarly, +when a new TensorMap is created on top of non-initialized memory, its +contents are not initialized. + +You can use one of the methods below to initialize the tensor memory. These +have an immediate effect on the tensor and return the tensor itself as a +result. These are not tensor Operations which delay evaluation. + +### ` setConstant(const Scalar& val)` + +Sets all elements of the tensor to the constant value `val`.
`Scalar` +is the type of data stored in the tensor. You can pass any value that is +convertible to that type. + +Returns the tensor itself in case you want to chain another call. + + a.setConstant(12.3f); + cout << "Constant: " << endl << a << endl << endl; + => + Constant: + 12.3 12.3 12.3 12.3 + 12.3 12.3 12.3 12.3 + 12.3 12.3 12.3 12.3 + +Note that `setConstant()` can be used on any tensor where the element type +has a copy constructor and an `operator=()`: + + Eigen::Tensor a(2, 3); + a.setConstant("yolo"); + cout << "String tensor: " << endl << a << endl << endl; + => + String tensor: + yolo yolo yolo + yolo yolo yolo + + +### ` setZero()` + +Fills the tensor with zeros. Equivalent to `setConstant(Scalar(0))`. +Returns the tensor itself in case you want to chain another call. + + a.setZero(); + cout << "Zeros: " << endl << a << endl << endl; + => + Zeros: + 0 0 0 0 + 0 0 0 0 + 0 0 0 0 + + +### ` setValues({..initializer_list})` + +Fills the tensor with explicit values specified in a std::initializer_list. +The type of the initializer list depends on the type and rank of the tensor. + +If the tensor has rank N, the initializer list must be nested N times. The +most deeply nested lists must contain P scalars of the Tensor type where P is +the size of the last dimension of the Tensor. + +For example, for a `TensorFixedSize` the initializer list must +contain 2 lists of 3 floats each. + +`setValues()` returns the tensor itself in case you want to chain another +call. + + Eigen::Tensor a(2, 3); + a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}}); + cout << "a" << endl << a << endl << endl; + => + a + 0 1 2 + 3 4 5 + +If a list is too short, the corresponding elements of the tensor will not be +changed. This is valid at each level of nesting. For example the following +code only sets the values of the first row of the tensor.
+ + Eigen::Tensor a(2, 3); + a.setConstant(1000); + a.setValues({{10, 20, 30}}); + cout << "a" << endl << a << endl << endl; + => + a + 10 20 30 + 1000 1000 1000 + +### ` setRandom()` + +Fills the tensor with random values. Returns the tensor itself in case you +want to chain another call. + + a.setRandom(); + cout << "Random: " << endl << a << endl << endl; + => + Random: + 0.680375 0.59688 -0.329554 0.10794 + -0.211234 0.823295 0.536459 -0.0452059 + 0.566198 -0.604897 -0.444451 0.257742 + +You can customize `setRandom()` by providing your own random number +generator as a template argument: + + a.setRandom(); + +Here, `MyRandomGenerator` must be a struct with the following member +functions, where Scalar and Index are the same as `::``Scalar` +and `::``Index`. + +See `struct UniformRandomGenerator` in TensorFunctors.h for an example. + + // Custom number generator for use with setRandom(). + struct MyRandomGenerator { + // Default and copy constructors. Both are needed + MyRandomGenerator() { } + MyRandomGenerator(const MyRandomGenerator& ) { } + + // Return a random value to be used. "element_location" is the + // location of the entry to set in the tensor, it can typically + // be ignored. + Scalar operator()(Eigen::DenseIndex element_location, + Eigen::DenseIndex /*unused*/ = 0) const { + return ; + } + + // Same as above but generates several numbers at a time. + typename internal::packet_traits::type packetOp( + Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { + return ; + } + }; + +You can also use one of the 2 random number generators that are part of the +tensor library: +* UniformRandomGenerator +* NormalRandomGenerator + + +## Data Access + +The Tensor, TensorFixedSize, and TensorRef classes provide the following +accessors to access the tensor coefficients: + + const Scalar& operator()(const array& indices) + const Scalar& operator()(Index firstIndex, IndexTypes... 
otherIndices) + Scalar& operator()(const array& indices) + Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + +The number of indices must be equal to the rank of the tensor. Moreover, these +accessors are not available on tensor expressions. In order to access the +values of a tensor expression, the expression must either be evaluated or +wrapped in a TensorRef. + + +### `Scalar* data()` and `const Scalar* data() const` + +Returns a pointer to the storage for the tensor. The pointer is const if the +tensor was const. This allows direct access to the data. The layout of the +data depends on the tensor layout: RowMajor or ColMajor. + +This access is usually only needed for special cases, for example when mixing +Eigen Tensor code with other libraries. + +Scalar is the type of data stored in the tensor. + + Eigen::Tensor a(3, 4); + float* a_data = a.data(); + a_data[0] = 123.45f; + cout << "a(0, 0): " << a(0, 0); + => a(0, 0): 123.45 + + +## Tensor Operations + +All the methods documented below return non evaluated tensor `Operations`. +These can be chained: you can apply another Tensor Operation to the value +returned by the method. + +The chain of Operation is evaluated lazily, typically when it is assigned to a +tensor. See "Controlling when Expression are Evaluated" for more details about +their evaluation. + +### ` constant(const Scalar& val)` + +Returns a tensor of the same type and dimensions as the original tensor but +where all elements have the value `val`. + +This is useful, for example, when you want to add or subtract a constant from a +tensor, or multiply every element of a tensor by a scalar. 
+ + Eigen::Tensor a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor b = a + a.constant(2.0f); + Eigen::Tensor c = b * b.constant(0.2f); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + cout << "c" << endl << c << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + 3 3 3 + 3 3 3 + + c + 0.6 0.6 0.6 + 0.6 0.6 0.6 + +### ` random()` + +Returns a tensor of the same type and dimensions as the current tensor +but where all elements have random values. + +This is for example useful to add random values to an existing tensor. +The generation of random values can be customized in the same manner +as for `setRandom()`. + + Eigen::Tensor a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor b = a + a.random(); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + 1.68038 1.5662 1.82329 + 0.788766 1.59688 0.395103 + + +## Unary Element Wise Operations + +All these operations take a single input tensor as argument and return a tensor +of the same type and dimensions as the tensor to which they are applied. The +requested operations are applied to each element independently. + +### ` operator-()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the opposite values of the original tensor. + + Eigen::Tensor a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor b = -a; + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + -1 -1 -1 + -1 -1 -1 + +### ` sqrt()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the square roots of the original tensor. + +### ` rsqrt()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the inverse square roots of the original tensor. + +### ` square()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the squares of the original tensor values. 
+ +### ` inverse()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the inverse of the original tensor values. + +### ` exp()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the exponential of the original tensor. + +### ` log()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the natural logarithms of the original tensor. + +### ` abs()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the absolute values of the original tensor. + +### ` pow(Scalar exponent)` + +Returns a tensor of the same type and dimensions as the original tensor +containing the coefficients of the original tensor to the power of the +exponent. + +The type of the exponent, Scalar, is always the same as the type of the +tensor coefficients. For example, only integer exponents can be used in +conjunction with tensors of integer values. + +You can use cast() to lift this restriction. For example this computes +cube roots of an int Tensor: + + Eigen::Tensor a(2, 3); + a.setValues({{0, 1, 8}, {27, 64, 125}}); + Eigen::Tensor b = a.cast().pow(1.0 / 3.0); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 0 1 8 + 27 64 125 + + b + 0 1 2 + 3 4 5 + +### ` operator * (Scalar scale)` + +Multiplies all the coefficients of the input tensor by the provided scale. + +### ` cwiseMax(Scalar threshold)` +TODO + +### ` cwiseMin(Scalar threshold)` +TODO + +### ` unaryExpr(const CustomUnaryOp& func)` +TODO + + +## Binary Element Wise Operations + +These operations take two input tensors as arguments. The 2 input tensors should +be of the same type and dimensions. The result is a tensor of the same +dimensions as the tensors to which they are applied, and unless otherwise +specified it is also of the same type. The requested operations are applied to +each pair of elements independently.
+ +### ` operator+(const OtherDerived& other)` + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise sums of the inputs. + +### ` operator-(const OtherDerived& other)` + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise differences of the inputs. + +### ` operator*(const OtherDerived& other)` + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise products of the inputs. + +### ` operator/(const OtherDerived& other)` + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise quotients of the inputs. + +This operator is not supported for integer types. + +### ` cwiseMax(const OtherDerived& other)` + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise maximums of the inputs. + +### ` cwiseMin(const OtherDerived& other)` + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise minimums of the inputs. + +### ` Logical operators` + +The following logical operators are supported as well: + +* operator&&(const OtherDerived& other) +* operator||(const OtherDerived& other) +* operator<(const OtherDerived& other) +* operator<=(const OtherDerived& other) +* operator>(const OtherDerived& other) +* operator>=(const OtherDerived& other) +* operator==(const OtherDerived& other) +* operator!=(const OtherDerived& other) + +They all return a tensor of boolean values. + + +## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor)) + +Selection is a coefficient-wise ternary operator that is the tensor equivalent +to the if-then-else operation. + + Tensor if = ...; + Tensor then = ...; + Tensor else = ...; + Tensor result = if.select(then, else); + +The 3 arguments must be of the same dimensions, which will also be the dimension +of the result.
The 'if' tensor must be of type boolean, the 'then' and the +'else' tensor must be of the same type, which will also be the type of the +result. + +Each coefficient in the result is equal to the corresponding coefficient in the +'then' tensor if the corresponding value in the 'if' tensor is true. If not, the +resulting coefficient will come from the 'else' tensor. + + +## Contraction + +Tensor *contractions* are a generalization of the matrix product to the +multidimensional case. + + // Create 2 matrices using tensors of rank 2 + Eigen::Tensor a(2, 3); + a.setValues({{1, 2, 3}, {6, 5, 4}}); + Eigen::Tensor b(3, 2); + b.setValues({{1, 2}, {4, 5}, {5, 6}}); + + // Compute the traditional matrix product + Eigen::array, 1> product_dims = { Eigen::IndexPair(1, 0) }; + Eigen::Tensor AB = a.contract(b, product_dims); + + // Compute the product of the transpose of the matrices + Eigen::array, 1> transposed_product_dims = { Eigen::IndexPair(0, 1) }; + Eigen::Tensor AtBt = a.contract(b, transposed_product_dims); + + // Contraction to scalar value using a double contraction. + // First coordinate of both tensors are contracted as well as both second coordinates, i.e., this computes the sum of the squares of the elements. + Eigen::array, 2> double_contraction_product_dims = { Eigen::IndexPair(0, 0), Eigen::IndexPair(1, 1) }; + Eigen::Tensor AdoubleContractedA = a.contract(a, double_contraction_product_dims); + + // Extracting the scalar value of the tensor contraction for further usage + int value = AdoubleContractedA(0); + +## Reduction Operations + +A *Reduction* operation returns a tensor with fewer dimensions than the +original tensor. The values in the returned tensor are computed by applying a +*reduction operator* to slices of values from the original tensor. You specify +the dimensions along which the slices are made. 
+ +The Eigen Tensor library provides a set of predefined reduction operators such +as `maximum()` and `sum()` and lets you define additional operators by +implementing a few methods from a reductor template. + +### Reduction Dimensions + +All reduction operations take a single parameter of type +`<TensorType>::Dimensions` which can always be specified as an array of +ints. These are called the "reduction dimensions." The values are the indices +of the dimensions of the input tensor over which the reduction is done. The +parameter can have at most as many elements as the rank of the input tensor; +each element must be less than the tensor rank, as it indicates one of the +dimensions to reduce. + +Each dimension of the input tensor should occur at most once in the reduction +dimensions as the implementation does not remove duplicates. + +The order of the values in the reduction dimensions does not affect the +results, but the code may execute faster if you list the dimensions in +increasing order. + +Example: Reduction along one dimension. + + // Create a tensor of 2 dimensions + Eigen::Tensor a(2, 3); + a.setValues({{1, 2, 3}, {6, 5, 4}}); + // Reduce it along the second dimension (1)... + Eigen::array dims({1 /* dimension to reduce */}); + // ...using the "maximum" operator. + // The result is a tensor with one dimension. The size of + // that dimension is the same as the first (non-reduced) dimension of a. + Eigen::Tensor b = a.maximum(dims); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 2 3 + 6 5 4 + + b + 3 + 6 + +Example: Reduction along two dimensions. + + Eigen::Tensor a(2, 3, 4); + a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, + {7.0f, 6.0f, 5.0f, 4.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}, + {{12.0f, 13.0f, 14.0f, 15.0f}, + {19.0f, 18.0f, 17.0f, 16.0f}, + {20.0f, 21.0f, 22.0f, 23.0f}}}); + // The tensor a has 3 dimensions.
We reduce along the + // first 2, resulting in a tensor with a single dimension + // of size 4 (the last dimension of a.) + // Note that we pass the array of reduction dimensions + // directly to the maximum() call. + Eigen::Tensor b = + a.maximum(Eigen::array({0, 1})); + cout << "b" << endl << b << endl << endl; + => + b + 20 + 21 + 22 + 23 + +#### Reduction along all dimensions + +As a special case, if you pass no parameter to a reduction operation the +original tensor is reduced along *all* its dimensions. The result is a +scalar, represented as a zero-dimension tensor. + + Eigen::Tensor a(2, 3, 4); + a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, + {7.0f, 6.0f, 5.0f, 4.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}, + {{12.0f, 13.0f, 14.0f, 15.0f}, + {19.0f, 18.0f, 17.0f, 16.0f}, + {20.0f, 21.0f, 22.0f, 23.0f}}}); + // Reduce along all dimensions using the sum() operator. + Eigen::Tensor b = a.sum(); + cout << "b" << endl << b << endl << endl; + => + b + 276 + + +### ` sum(const Dimensions& new_dims)` +### ` sum()` + +Reduce a tensor using the sum() operator. The resulting values +are the sum of the reduced values. + +### ` mean(const Dimensions& new_dims)` +### ` mean()` + +Reduce a tensor using the mean() operator. The resulting values +are the mean of the reduced values. + +### ` maximum(const Dimensions& new_dims)` +### ` maximum()` + +Reduce a tensor using the maximum() operator. The resulting values are the +largest of the reduced values. + +### ` minimum(const Dimensions& new_dims)` +### ` minimum()` + +Reduce a tensor using the minimum() operator. The resulting values +are the smallest of the reduced values. + +### ` prod(const Dimensions& new_dims)` +### ` prod()` + +Reduce a tensor using the prod() operator. The resulting values +are the product of the reduced values. + +### ` all(const Dimensions& new_dims)` +### ` all()` +Reduce a tensor using the all() operator. Casts tensor to bool and then checks +whether all elements are true. 
Runs through all elements rather than +short-circuiting, so may be significantly inefficient. + +### ` any(const Dimensions& new_dims)` +### ` any()` +Reduce a tensor using the any() operator. Casts tensor to bool and then checks +whether any element is true. Runs through all elements rather than +short-circuiting, so may be significantly inefficient. + + +### ` reduce(const Dimensions& new_dims, const Reducer& reducer)` + +Reduce a tensor using a user-defined reduction operator. See `SumReducer` +in TensorFunctors.h for information on how to implement a reduction operator. + + +## Scan Operations + +A *Scan* operation returns a tensor with the same dimensions as the original +tensor. The operation performs an inclusive scan along the specified +axis, which means it computes a running total along the axis for a given +reduction operation. +If the reduction operation corresponds to summation, then this computes the +prefix sum of the tensor along the given axis. + +Example: + + + // Create a tensor of 2 dimensions + Eigen::Tensor a(2, 3); + a.setValues({{1, 2, 3}, {4, 5, 6}}); + // Scan it along the second dimension (1) using summation + Eigen::Tensor b = a.cumsum(1); + // The result is a tensor with the same size as the input + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 2 3 + 4 5 6 + + b + 1 3 6 + 4 9 15 + +### ` cumsum(const Index& axis)` + +Perform a scan by summing consecutive entries. + +### ` cumprod(const Index& axis)` + +Perform a scan by multiplying consecutive entries. + + +## Convolutions + +### ` convolve(const Kernel& kernel, const Dimensions& dims)` + +Returns a tensor that is the output of the convolution of the input tensor with the kernel, +along the specified dimensions of the input tensor.
The dimension size for dimensions of the output tensor +which were part of the convolution will be reduced by the formula: +output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size). +The dimension sizes for dimensions that were not part of the convolution will remain the same. +Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the +convolution is computed (the first dimension has the shortest stride for ColMajor, whereas RowMajor's shortest stride is +for the last dimension). + + // Compute convolution along the second and third dimension. + Tensor input(3, 3, 7, 11); + Tensor kernel(2, 2); + Tensor output(3, 2, 6, 11); + input.setRandom(); + kernel.setRandom(); + + Eigen::array dims({1, 2}); // Specify second and third dimension for convolution. + output = input.convolve(kernel, dims); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 6; ++k) { + for (int l = 0; l < 11; ++l) { + const float result = output(i,j,k,l); + const float expected = input(i,j+0,k+0,l) * kernel(0,0) + + input(i,j+1,k+0,l) * kernel(1,0) + + input(i,j+0,k+1,l) * kernel(0,1) + + input(i,j+1,k+1,l) * kernel(1,1); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + + +## Geometrical Operations + +These operations return a Tensor with different dimensions than the original +Tensor. They can be used to access slices of tensors, see them with different +dimensions, or pad tensors with additional data. + +### ` reshape(const Dimensions& new_dims)` + +Returns a view of the input tensor that has been reshaped to the specified +new dimensions. The argument new_dims is an array of Index values. The +rank of the resulting tensor is equal to the number of elements in new_dims. + +The product of all the sizes in the new dimension array must be equal to +the number of elements in the input tensor. 
+ + // Increase the rank of the input tensor by introducing a new dimension + // of size 1. + Tensor input(7, 11); + array three_dims{{7, 11, 1}}; + Tensor result = input.reshape(three_dims); + + // Decrease the rank of the input tensor by merging 2 dimensions; + array one_dim{{7 * 11}}; + Tensor result = input.reshape(one_dim); + +This operation does not move any data in the input tensor, so the resulting +contents of a reshaped Tensor depend on the data layout of the original Tensor. + +For example this is what happens when you `reshape()` a 2D ColMajor tensor +to one dimension: + + Eigen::Tensor a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array one_dim({3 * 2}); + Eigen::Tensor b = a.reshape(one_dim); + cout << "b" << endl << b << endl; + => + b + 0 + 300 + 100 + 400 + 200 + 500 + +This is what happens when the 2D Tensor is RowMajor: + + Eigen::Tensor a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array one_dim({3 * 2}); + Eigen::Tensor b = a.reshape(one_dim); + cout << "b" << endl << b << endl; + => + b + 0 + 100 + 200 + 300 + 400 + 500 + +The reshape operation is an lvalue. In other words, it can be used on the left +side of the assignment operator. + +The previous example can be rewritten as follows: + + Eigen::Tensor a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array two_dim({2, 3}); + Eigen::Tensor b(6); + b.reshape(two_dim) = a; + cout << "b" << endl << b << endl; + => + b + 0 + 300 + 100 + 400 + 200 + 500 + +Note that "b" itself was not reshaped but that instead the assignment is done to +the reshape view of b. + + +### ` shuffle(const Shuffle& shuffle)` + +Returns a copy of the input tensor whose dimensions have been +reordered according to the specified permutation. The argument shuffle +is an array of Index values. Its size is the rank of the input +tensor. It must contain a permutation of 0, 1, ..., rank - 1.
The i-th +dimension of the output tensor equals the size of the shuffle[i]-th +dimension of the input tensor. For example: + + // Shuffle all dimensions to the left by 1. + Tensor input(20, 30, 50); + // ... set some values in input. + Tensor output = input.shuffle({1, 2, 0}); + + eigen_assert(output.dimension(0) == 30); + eigen_assert(output.dimension(1) == 50); + eigen_assert(output.dimension(2) == 20); + +Indices into the output tensor are shuffled accordingly to formulate +indices into the input tensor. For example, one can assert in the above +code snippet that: + + eigen_assert(output(3, 7, 11) == input(11, 3, 7)); + +In general, one can assert that + + eigen_assert(output(..., indices[shuffle[i]], ...) == + input(..., indices[i], ...)) + +The shuffle operation results in an lvalue, which means that it can be assigned +to. In other words, it can be used on the left side of the assignment operator. + +Let's rewrite the previous example to take advantage of this feature: + + // Shuffle all dimensions to the left by 1. + Tensor input(20, 30, 50); + // ... set some values in input. + Tensor output(30, 50, 20); + output.shuffle({2, 0, 1}) = input; + + +### ` stride(const Strides& strides)` + +Returns a view of the input tensor that strides (skips stride-1 +elements) along each of the dimensions. The argument strides is an +array of Index values. The dimensions of the resulting tensor are +ceil(input_dimensions[i] / strides[i]). + +For example this is what happens when you `stride()` a 2D tensor: + + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}}); + Eigen::array strides({3, 2}); + Eigen::Tensor b = a.stride(strides); + cout << "b" << endl << b << endl; + => + b + 0 200 + 900 1100 + +It is possible to assign a tensor to a stride: + Tensor input(20, 30, 50); + // ... set some values in input.
+ Tensor output(40, 90, 200); + output.stride({2, 3, 4}) = input; + + +### ` slice(const StartIndices& offsets, const Sizes& extents)` + +Returns a sub-tensor of the given tensor. For each dimension i, the slice is +made of the coefficients stored between offset[i] and offset[i] + extents[i] in +the input tensor. + + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, + {600, 700, 800}, {900, 1000, 1100}}); + Eigen::array offsets = {1, 0}; + Eigen::array extents = {2, 2}; + Eigen::Tensor slice = a.slice(offsets, extents); + cout << "a" << endl << a << endl; + => + a + 0 100 200 + 300 400 500 + 600 700 800 + 900 1000 1100 + cout << "slice" << endl << slice << endl; + => + slice + 300 400 + 600 700 + + +### ` chip(const Index offset, const Index dim)` + +A chip is a special kind of slice. It is the subtensor at the given offset in +the dimension dim. The returned tensor has one fewer dimension than the input +tensor: the dimension dim is removed. + +For example, a matrix chip would be either a row or a column of the input +matrix. + + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, + {600, 700, 800}, {900, 1000, 1100}}); + Eigen::Tensor row_3 = a.chip(2, 0); + Eigen::Tensor col_2 = a.chip(1, 1); + cout << "a" << endl << a << endl; + => + a + 0 100 200 + 300 400 500 + 600 700 800 + 900 1000 1100 + cout << "row_3" << endl << row_3 << endl; + => + row_3 + 600 700 800 + cout << "col_2" << endl << col_2 << endl; + => + col_2 + 100 400 700 1000 + +It is possible to assign values to a tensor chip since the chip operation is a +lvalue. 
For example: + + Eigen::Tensor a(3); + a.setValues({{100, 200, 300}}); + Eigen::Tensor b(2, 3); + b.setZero(); + b.chip(0, 0) = a; + cout << "a" << endl << a << endl; + => + a + 100 + 200 + 300 + cout << "b" << endl << b << endl; + => + b + 100 200 300 + 0 0 0 + + +### ` reverse(const ReverseDimensions& reverse)` + +Returns a view of the input tensor that reverses the order of the coefficients +along a subset of the dimensions. The argument reverse is an array of boolean +values that indicates whether or not the order of the coefficients should be +reversed along each of the dimensions. This operation preserves the dimensions +of the input tensor. + +For example this is what happens when you `reverse()` the first dimension +of a 2D tensor: + + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, + {600, 700, 800}, {900, 1000, 1100}}); + Eigen::array reverse({true, false}); + Eigen::Tensor b = a.reverse(reverse); + cout << "a" << endl << a << endl << "b" << endl << b << endl; + => + a + 0 100 200 + 300 400 500 + 600 700 800 + 900 1000 1100 + b + 900 1000 1100 + 600 700 800 + 300 400 500 + 0 100 200 + + +### ` broadcast(const Broadcast& broadcast)` + +Returns a view of the input tensor in which the input is replicated one to many +times. +The broadcast argument specifies how many copies of the input tensor need to be +made in each of the dimensions. + + Eigen::Tensor a(2, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}}); + Eigen::array bcast({3, 2}); + Eigen::Tensor b = a.broadcast(bcast); + cout << "a" << endl << a << endl << "b" << endl << b << endl; + => + a + 0 100 200 + 300 400 500 + b + 0 100 200 0 100 200 + 300 400 500 300 400 500 + 0 100 200 0 100 200 + 300 400 500 300 400 500 + 0 100 200 0 100 200 + 300 400 500 300 400 500 + +### ` concatenate(const OtherDerived& other, Axis axis)` + +TODO + +### ` pad(const PaddingDimensions& padding)` + +Returns a view of the input tensor in which the input is padded with zeros. 
+ + Eigen::Tensor a(2, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}}); + Eigen::array, 2> paddings; + paddings[0] = make_pair(0, 1); + paddings[1] = make_pair(2, 3); + Eigen::Tensor b = a.pad(paddings); + cout << "a" << endl << a << endl << "b" << endl << b << endl; + => + a + 0 100 200 + 300 400 500 + b + 0 0 0 0 + 0 0 0 0 + 0 100 200 0 + 300 400 500 0 + 0 0 0 0 + 0 0 0 0 + 0 0 0 0 + + +### ` extract_patches(const PatchDims& patch_dims)` + +Returns a tensor of coefficient patches extracted from the input tensor, where +each patch is of dimension specified by 'patch_dims'. The returned tensor has +one greater dimension than the input tensor, which is used to index each patch. +The patch index in the output tensor depends on the data layout of the input +tensor: the patch index is the last dimension in ColMajor layout, and the first +dimension in RowMajor layout. + +For example, given the following input tensor: + + Eigen::Tensor tensor(3,4); + tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, + {4.0f, 5.0f, 6.0f, 7.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}); + + cout << "tensor: " << endl << tensor << endl; +=> +tensor: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 + +Six 2x2 patches can be extracted and indexed using the following code: + + Eigen::Tensor patch; + Eigen::array patch_dims; + patch_dims[0] = 2; + patch_dims[1] = 2; + patch = tensor.extract_patches(patch_dims); + for (int k = 0; k < 6; ++k) { + cout << "patch index: " << k << endl; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + if (DataLayout == ColMajor) { + cout << patch(i, j, k) << " "; + } else { + cout << patch(k, i, j) << " "; + } + } + cout << endl; + } + } + +This code results in the following output when the data layout is ColMajor: + +patch index: 0 +0 1 +4 5 +patch index: 1 +4 5 +8 9 +patch index: 2 +1 2 +5 6 +patch index: 3 +5 6 +9 10 +patch index: 4 +2 3 +6 7 +patch index: 5 +6 7 +10 11 + +This code results in the following output when the data layout is RowMajor: +(NOTE: the set of patches is
the same as in ColMajor, but are indexed differently). + +patch index: 0 +0 1 +4 5 +patch index: 1 +1 2 +5 6 +patch index: 2 +2 3 +6 7 +patch index: 3 +4 5 +8 9 +patch index: 4 +5 6 +9 10 +patch index: 5 +6 7 +10 11 + +### ` extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)` + +Returns a tensor of coefficient image patches extracted from the input tensor, +which is expected to have dimensions ordered as follows (depending on the data +layout of the input tensor, and the number of additional dimensions 'N'): + +*) ColMajor +1st dimension: channels (of size d) +2nd dimension: rows (of size r) +3rd dimension: columns (of size c) +4th-Nth dimension: time (for video) or batch (for bulk processing). + +*) RowMajor (reverse order of ColMajor) +1st-Nth dimension: time (for video) or batch (for bulk processing). +N+1'th dimension: columns (of size c) +N+2'th dimension: rows (of size r) +N+3'th dimension: channels (of size d) + +The returned tensor has one greater dimension than the input tensor, which is +used to index each patch. The patch index in the output tensor depends on the +data layout of the input tensor: the patch index is the 4'th dimension in +ColMajor layout, and the 4'th from the last dimension in RowMajor layout. 
+ +For example, given the following input tensor with the following dimension +sizes: + *) depth: 2 + *) rows: 3 + *) columns: 5 + *) batch: 7 + + Tensor tensor(2,3,5,7); + Tensor tensor_row_major = tensor.swap_layout(); + +2x2 image patches can be extracted and indexed using the following code: + +*) 2D patch: ColMajor (patch indexed by second-to-last dimension) + Tensor twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + // twod_patch.dimension(0) == 2 + // twod_patch.dimension(1) == 2 + // twod_patch.dimension(2) == 2 + // twod_patch.dimension(3) == 3*5 + // twod_patch.dimension(4) == 7 + +*) 2D patch: RowMajor (patch indexed by the second dimension) + Tensor twod_patch_row_major; + twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); + // twod_patch_row_major.dimension(0) == 7 + // twod_patch_row_major.dimension(1) == 3*5 + // twod_patch_row_major.dimension(2) == 2 + // twod_patch_row_major.dimension(3) == 2 + // twod_patch_row_major.dimension(4) == 2 + +## Special Operations + +### ` cast()` + +Returns a tensor of type T with the same dimensions as the original tensor. +The returned tensor contains the values of the original tensor converted to +type T. + + Eigen::Tensor a(2, 3); + Eigen::Tensor b = a.cast(); + +This can be useful for example if you need to do element-wise division of +Tensors of integers. This is not currently supported by the Tensor library +but you can easily cast the tensors to floats to do the division: + + Eigen::Tensor a(2, 3); + a.setValues({{0, 1, 2}, {3, 4, 5}}); + Eigen::Tensor b = + (a.cast() / a.constant(2).cast()).cast(); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 0 1 2 + 3 4 5 + + b + 0 0 1 + 1 2 2 + + +### ` eval()` + +TODO + + +## Representation of scalar values + +Scalar values are often represented by tensors of size 1 and rank 0. For example +Tensor::maximum() currently returns a Tensor.
Similarly, the inner +product of 2 1d tensors (through contractions) returns a 0d tensor. + +## Limitations + +* The number of tensor dimensions is currently limited to 250 when using a + compiler that supports cxx11. It is limited to only 5 for older compilers. +* The IndexList class requires a cxx11 compliant compiler. You can use an + array of indices instead if you don't have access to a modern compiler. +* On GPUs only floating point values are properly tested and optimized for. +* Complex and integer values are known to be broken on GPUs. If you try to use + them you'll most likely end up triggering a static assertion failure such as + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/Tensor.h new file mode 100644 index 00000000..e44d51ec --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -0,0 +1,527 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_H + +namespace Eigen { + +/** \class Tensor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor class. + * + * The %Tensor class is the work-horse for all \em dense tensors within Eigen. + * + * The %Tensor class encompasses only dynamic-size objects so far. + * + * The first two template parameters are required: + * \tparam Scalar_ Numeric type, e.g. float, double, int or `std::complex`. + * User defined scalar types are supported as well (see \ref user_defined_scalars "here"). + * \tparam NumIndices_ Number of indices (i.e. 
rank of the tensor) + * + * The remaining template parameters are optional -- in most cases you don't have to worry about them. + * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either + * \b #AutoAlign or \b #DontAlign. + * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required + * for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization. + * Support for such operations (i.e. adding two tensors etc.) is planned. + * + * You can access elements of tensors using normal subscripting: + * + * \code + * Eigen::Tensor t(10, 10, 10, 10); + * t(0, 1, 2, 3) = 42.0; + * \endcode + * + * This class can be extended with the help of the plugin mechanism described on the page + * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN. + * + * Some notes: + * + *
+ *
Relation to other parts of Eigen:
+ *
The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that + * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code + * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor + * class does not provide any of these features and is only available as a stand-alone class that just allows for + * coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to + * change dramatically.
+ *
+ * + * \ref TopicStorageOrders + */ + +template +class Tensor : public TensorBase > +{ + public: + typedef Tensor Self; + typedef TensorBase > Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef Scalar_ Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + enum { + IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign), + Layout = Options_ & RowMajor ? RowMajor : ColMajor, + CoordAccess = true, + RawAccess = true + }; + + static const int Options = Options_; + static const int NumIndices = NumIndices_; + typedef DSizes Dimensions; + + protected: + TensorStorage m_storage; + +#ifdef EIGEN_HAS_SFINAE + template + struct isOfNormalIndex{ + static const bool is_array = internal::is_base_of, CustomIndices>::value; + static const bool is_int = NumTraits::IsInteger; + static const bool value = is_array | is_int; + }; +#endif + + public: + // Metadata + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC inline const Scalar& 
coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const + { + return coeff(internal::customIndices2Array(indices)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices) + { + return coeffRef(internal::customIndices2Array(indices)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array{{firstIndex, secondIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + return coeff(array(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + return coeff(array(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + return coeff(array(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + return coeff(array(i0, i1, i2, i3, i4)); + } +#endif + + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const + { + return coeff(internal::customIndices2Array(indices)); + } +#endif + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + { + return coeff(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array{{firstIndex, secondIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + return coeffRef(array(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + return coeffRef(array(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + return coeffRef(array(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + return coeffRef(array(i0, i1, i2, i3, i4)); + } +#endif + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + { + return coeffRef(indices); + } + + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices) + { + return coeffRef(internal::customIndices2Array(indices)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor() + : m_storage() + { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const Self& other) + : 
m_storage(other.m_storage) + { + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions) + : m_storage(firstDimension, otherDimensions...) + { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1) + : m_storage(dim1, array(dim1)) + { + EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2) + : m_storage(dim1*dim2, array(dim1, dim2)) + { + EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3) + : m_storage(dim1*dim2*dim3, array(dim1, dim2, dim3)) + { + EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4) + : m_storage(dim1*dim2*dim3*dim4, array(dim1, dim2, dim3, dim4)) + { + EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) + : m_storage(dim1*dim2*dim3*dim4*dim5, array(dim1, dim2, dim3, dim4, dim5)) + { + EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#endif + + /** Normal Dimension */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array& dimensions) + : m_storage(internal::array_prod(dimensions), dimensions) + { + EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator(assign, 
DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + void resize(Index firstDimension, IndexTypes... otherDimensions) + { + // The number of dimensions used to resize a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + resize(array{{firstDimension, otherDimensions...}}); + } +#endif + + /** Normal Dimension */ + EIGEN_DEVICE_FUNC void resize(const array& dimensions) + { + int i; + Index size = Index(1); + for (i = 0; i < NumIndices; i++) { + internal::check_rows_cols_for_overflow::run(size, dimensions[i]); + size *= dimensions[i]; + } + #ifdef EIGEN_INITIALIZE_COEFFS + bool size_changed = size != this->size(); + m_storage.resize(size, dimensions); + if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + #else + m_storage.resize(size, dimensions); + #endif + } + + // Why this overload, DSizes is derived from array ??? 
// + EIGEN_DEVICE_FUNC void resize(const DSizes& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = dimensions[i]; + } + resize(dims); + } + + EIGEN_DEVICE_FUNC + void resize() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + // Nothing to do: rank 0 tensors have fixed size + } + + /** Custom Dimension */ +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions) + { + resize(internal::customIndices2Array(dimensions)); + } +#endif + +#ifndef EIGEN_EMULATE_CXX11_META_H + template + EIGEN_DEVICE_FUNC + void resize(const Sizes& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = static_cast(dimensions[i]); + } + resize(dims); + } +#else + template + EIGEN_DEVICE_FUNC + void resize(const Sizes& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = static_cast(dimensions[i]); + } + resize(dims); + } +#endif + + protected: + + bool checkIndexRange(const array& indices) const + { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return + // check whether the indices are all >= 0 + array_apply_and_reduce(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce(indices, m_storage.dimensions()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h new file mode 100644 index 00000000..7da160fd --- /dev/null +++ 
b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -0,0 +1,299 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H +#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H + +namespace Eigen { +namespace internal { + +/** \class TensorIndexTuple + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor + Index Tuple class. + * + * + */ +template +struct traits > : public traits +{ + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef Tuple Scalar; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorIndexTupleOp& type; +}; + +template +struct nested, 1, + typename eval >::type> +{ + typedef TensorIndexTupleOp type; +}; + +} // end namespace internal + +template +class TensorIndexTupleOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + typedef Tuple CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexTupleOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +// 
Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorIndexTupleOp XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + typedef typename TensorEvaluator::Dimensions Dimensions; + static const int NumDims = internal::array_size::value; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = /*TensorEvaluator::PacketAccess*/ false, + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return CoeffReturnType(index, m_impl.coeff(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + TensorEvaluator m_impl; +}; + +namespace internal { + +/** \class TensorTupleIndex + * \ingroup CXX11_Tensor_Module + * + * \brief Converts to Tensor > and reduces to Tensor. 
+ * + */ +template +struct traits > : public traits +{ + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef Index Scalar; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - array_size::value; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorTupleReducerOp& type; +}; + +template +struct nested, 1, + typename eval >::type> +{ + typedef TensorTupleReducerOp type; +}; + +} // end namespace internal + +template +class TensorTupleReducerOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + typedef Index CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr, + const ReduceOp& reduce_op, + const int return_dim, + const Dims& reduce_dims) + : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + const ReduceOp& reduce_op() const { return m_reduce_op; } + + EIGEN_DEVICE_FUNC + const Dims& reduce_dims() const { return m_reduce_dims; } + + EIGEN_DEVICE_FUNC + int return_dim() const { return m_return_dim; } + + protected: + typename XprType::Nested m_xpr; + const ReduceOp m_reduce_op; + const int m_return_dim; + const Dims m_reduce_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorTupleReducerOp XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar 
Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename TensorIndexTupleOp::CoeffReturnType TupleType; + typedef typename TensorEvaluator >, Device>::Dimensions Dimensions; + typedef typename TensorEvaluator , Device>::Dimensions InputDimensions; + static const int NumDims = internal::array_size::value; + typedef array StrideDims; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = /*TensorEvaluator::PacketAccess*/ false, + BlockAccess = false, + Layout = TensorEvaluator >, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_orig_impl(op.expression(), device), + m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), + m_return_dim(op.return_dim()) { + + gen_strides(m_orig_impl.dimensions(), m_strides); + if (Layout == static_cast(ColMajor)) { + const Index total_size = internal::array_prod(m_orig_impl.dimensions()); + m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size; + } else { + const Index total_size = internal::array_prod(m_orig_impl.dimensions()); + m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size; + } + m_stride_div = m_strides[m_return_dim]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + const TupleType v = m_impl.coeff(index); + return (m_return_dim < 0) ? 
v.first : (v.first % m_stride_mod) / m_stride_div; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = 1.0 + + (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost() + TensorOpCost::DivCost())); + return m_orig_impl.costPerCoeff(vectorized) + + m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost); + } + + private: + EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) { + if (m_return_dim < 0) { + return; // Won't be using the strides. + } + eigen_assert(m_return_dim < NumDims && + "Asking to convert index to a dimension outside of the rank"); + + // Calculate m_stride_div and m_stride_mod, which are used to + // calculate the value of an index w.r.t. the m_return_dim. + if (Layout == static_cast(ColMajor)) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i-1] * dims[i-1]; + } + } else { + strides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i+1] * dims[i+1]; + } + } + } + + protected: + TensorEvaluator, Device> m_orig_impl; + TensorEvaluator >, Device> m_impl; + const int m_return_dim; + StrideDims m_strides; + Index m_stride_mod; + Index m_stride_div; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h new file mode 100644 index 00000000..e13ba738 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -0,0 +1,181 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H +#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H + +namespace Eigen { + +/** \class TensorAssign + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor assignment class. + * + * This class is represents the assignment of the values resulting from the evaluation of + * the rhs expression to the memory locations denoted by the lhs expression. + */ +namespace internal { +template +struct traits > +{ + typedef typename LhsXprType::Scalar Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const std::size_t NumDimensions = internal::traits::NumDimensions; + static const int Layout = internal::traits::Layout; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorAssignOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorAssignOp type; +}; + +} // end namespace internal + + + +template +class TensorAssignOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename LhsXprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + typename internal::remove_all::type& + 
lhsExpression() const { return *((typename internal::remove_all::type*)&m_lhs_xpr); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename internal::remove_all::type& m_lhs_xpr; + const typename internal::remove_all::type& m_rhs_xpr; +}; + + +template +struct TensorEvaluator, Device> +{ + typedef TensorAssignOp XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // The dimensions of the lhs and the rhs tensors should be equal to prevent + // overflows and ensure the result is fully initialized. + // TODO: use left impl instead if right impl dimensions are known at compile time. + return m_rightImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + m_leftImpl.evalSubExprsIfNeeded(NULL); + // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non + // null value), attempt to evaluate the rhs expression in place. 
Returns true iff in place + // evaluation isn't supported and the caller still needs to manually assign the values generated + // by the rhs to the lhs. + return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_leftImpl.coeff(index); + } + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_leftImpl.template packet(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here, but reduce left + // cost by one load because we are using m_leftImpl.coeffRef. 
+ TensorOpCost left = m_leftImpl.costPerCoeff(vectorized); + return m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost( + numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)), + left.bytes_stored(), left.compute_cycles()) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + + /// required by sycl in order to extract the accessor + const TensorEvaluator& left_impl() const { return m_leftImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& right_impl() const { return m_rightImpl; } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } + + private: + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + +} + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h new file mode 100644 index 00000000..49c07959 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -0,0 +1,1012 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H +#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H + +// clang-format off + +namespace Eigen { + +/** \class TensorBase + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor base class. + * + * This class is the common parent of the Tensor and TensorMap class, thus + * making it possible to use either class interchangably in expressions. 
+ */ +#ifndef EIGEN_PARSED_BY_DOXYGEN +// FIXME Doxygen does not like the inheritance with different template parameters +// Since there is no doxygen documentation inside, we disable it for now +template +class TensorBase +{ + public: + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef typename internal::remove_const::type CoeffReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; + + // Generic nullary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + nullaryExpr(const CustomNullaryOp& func) const { + return TensorCwiseNullaryOp(derived(), func); + } + + // Coefficient-wise nullary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + constant(const Scalar& value) const { + return nullaryExpr(internal::scalar_constant_op(value)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + random() const { + return nullaryExpr(internal::UniformRandomGenerator()); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + random(const RandomGenerator& gen = RandomGenerator()) const { + return nullaryExpr(gen); + } + + // Tensor generation + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorGeneratorOp + generate(const Generator& generator) const { + return TensorGeneratorOp(derived(), generator); + } + + // Generic unary operation support. 
+ template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp + unaryExpr(const CustomUnaryOp& func) const { + return TensorCwiseUnaryOp(derived(), func); + } + + // Coefficient-wise unary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator-() const { + return unaryExpr(internal::scalar_opposite_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sqrt() const { + return unaryExpr(internal::scalar_sqrt_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sign() const { + return unaryExpr(internal::scalar_sign_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + rsqrt() const { + return unaryExpr(internal::scalar_rsqrt_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + square() const { + return unaryExpr(internal::scalar_square_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cube() const { + return unaryExpr(internal::scalar_cube_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + inverse() const { + return unaryExpr(internal::scalar_inverse_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + tanh() const { + return unaryExpr(internal::scalar_tanh_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + lgamma() const { + return unaryExpr(internal::scalar_lgamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + digamma() const { + return unaryExpr(internal::scalar_digamma_op()); + } + + // igamma(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma(const OtherDerived& other) const { + return 
binaryExpr(other.derived(), internal::scalar_igamma_op()); + } + + // igammac(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igammac(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igammac_op()); + } + + // zeta(x = this, q = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + zeta(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_zeta_op()); + } + + // polygamma(n = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + polygamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_polygamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erf() const { + return unaryExpr(internal::scalar_erf_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erfc() const { + return unaryExpr(internal::scalar_erfc_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sigmoid() const { + return unaryExpr(internal::scalar_sigmoid_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + exp() const { + return unaryExpr(internal::scalar_exp_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log() const { + return unaryExpr(internal::scalar_log_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log1p() const { + return unaryExpr(internal::scalar_log1p_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + abs() const { + return unaryExpr(internal::scalar_abs_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const 
TensorCwiseUnaryOp, const Derived> + conjugate() const { + return unaryExpr(internal::scalar_conjugate_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + pow(Scalar exponent) const { + return unaryExpr(internal::bind2nd_op >(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + real() const { + return unaryExpr(internal::scalar_real_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + imag() const { + return unaryExpr(internal::scalar_imag_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator+ (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator+ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator- (Scalar rhs) const { + EIGEN_STATIC_ASSERT((NumTraits::IsSigned || internal::is_same >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator- (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator* (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator* (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator/ (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + 
+ EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator/ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator% (Scalar rhs) const { + EIGEN_STATIC_ASSERT(NumTraits::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD); + return unaryExpr(internal::scalar_mod_op(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMax(Scalar threshold) const { + return cwiseMax(constant(threshold)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMin(Scalar threshold) const { + return cwiseMin(constant(threshold)); + } + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorConversionOp + cast() const { + return TensorConversionOp(derived()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + round() const { + return unaryExpr(internal::scalar_round_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + ceil() const { + return unaryExpr(internal::scalar_ceil_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + floor() const { + return unaryExpr(internal::scalar_floor_op()); + } + + // Generic binary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp + binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { + return TensorCwiseBinaryOp(derived(), other, func); + } + + // Coefficient-wise binary operators. 
+ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator+(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_sum_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator-(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_difference_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator*(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_product_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator/(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_quotient_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMax(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_max_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMin(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_min_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp + operator&&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp + operator||(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp + operator^(const OtherDerived& other) const { + return binaryExpr(other.derived(), 
internal::scalar_boolean_xor_op()); + } + + // Comparisons and tests. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<=(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>=(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator==(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator!=(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + // comparisons and tests for Scalars + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator<(Scalar threshold) const { + return operator<(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator<=(Scalar threshold) const { + return operator<=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const 
Derived, const TensorCwiseNullaryOp, const Derived> > + operator>(Scalar threshold) const { + return operator>(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator>=(Scalar threshold) const { + return operator>=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator==(Scalar threshold) const { + return operator==(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator!=(Scalar threshold) const { + return operator!=(constant(threshold)); + } + + // Checks + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + (isnan)() const { + return unaryExpr(internal::scalar_isnan_op()); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + (isinf)() const { + return unaryExpr(internal::scalar_isinf_op()); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + (isfinite)() const { + return unaryExpr(internal::scalar_isfinite_op()); + } + + // Coefficient-wise ternary operators. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { + return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); + } + + // Contractions. + typedef Eigen::IndexPair DimensionPair; + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims) const { + return TensorContractionOp(derived(), other.derived(), dims); + } + + // Convolutions. 
+ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConvolutionOp + convolve(const KernelDerived& kernel, const Dimensions& dims) const { + return TensorConvolutionOp(derived(), kernel.derived(), dims); + } + + // Fourier transforms + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorFFTOp + fft(const FFT& fft) const { + return TensorFFTOp(derived(), fft); + } + + // Scan. + typedef TensorScanOp, const Derived> TensorScanSumOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanSumOp + cumsum(const Index& axis, bool exclusive = false) const { + return TensorScanSumOp(derived(), axis, exclusive); + } + + typedef TensorScanOp, const Derived> TensorScanProdOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanProdOp + cumprod(const Index& axis, bool exclusive = false) const { + return TensorScanProdOp(derived(), axis, exclusive); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanOp + scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const { + return TensorScanOp(derived(), axis, exclusive, reducer); + } + + // Reductions. 
+ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + sum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + sum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::SumReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + mean(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MeanReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + mean() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MeanReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + prod(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::ProdReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + prod() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::ProdReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + maximum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + maximum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MaxReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + minimum(const Dims& dims) const { + return 
TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + minimum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MinReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp > + all(const Dims& dims) const { + return cast().reduce(dims, internal::AndReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const TensorConversionOp > + all() const { + DimensionList in_dims; + return cast().reduce(in_dims, internal::AndReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp > + any(const Dims& dims) const { + return cast().reduce(dims, internal::OrReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const TensorConversionOp > + any() const { + DimensionList in_dims; + return cast().reduce(in_dims, internal::OrReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMaxTupleReducer >, + const array, const Derived> + argmax() const { + array in_dims; + for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorTupleReducerOp< + internal::ArgMaxTupleReducer >, + const array, + const Derived>(derived(), internal::ArgMaxTupleReducer >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMinTupleReducer >, + const array, const Derived> + argmin() const { + array in_dims; + for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorTupleReducerOp< + internal::ArgMinTupleReducer >, + const array, + const Derived>(derived(), internal::ArgMinTupleReducer >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMaxTupleReducer >, + const array, const Derived> + argmax(const int return_dim) const 
{ + array in_dims; + in_dims[0] = return_dim; + return TensorTupleReducerOp< + internal::ArgMaxTupleReducer >, + const array, + const Derived>(derived(), internal::ArgMaxTupleReducer >(), return_dim, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMinTupleReducer >, + const array, const Derived> + argmin(const int return_dim) const { + array in_dims; + in_dims[0] = return_dim; + return TensorTupleReducerOp< + internal::ArgMinTupleReducer >, + const array, + const Derived>(derived(), internal::ArgMinTupleReducer >(), return_dim, in_dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp + reduce(const Dims& dims, const Reducer& reducer) const { + return TensorReductionOp(derived(), dims, reducer); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorBroadcastingOp + broadcast(const Broadcast& broadcast) const { + return TensorBroadcastingOp(derived(), broadcast); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, Axis axis) const { + return TensorConcatenationOp(derived(), other.derived(), axis); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPatchOp + extract_patches(const PatchDims& patch_dims) const { + return TensorPatchOp(derived(), patch_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index patch_rows = 1, const Index patch_cols = 1, + const Index row_stride = 1, const Index col_stride = 1, + const Index in_row_stride = 1, const Index in_col_stride = 1, + const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, 1, 1, padding_type, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const 
Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top, const Index padding_bottom, + const Index padding_left,const Index padding_right, + const Scalar padding_value) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride, + padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1, + const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { + return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value); + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride, const Index row_stride, const Index col_stride, + const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top_z, const Index padding_bottom_z, + const Index padding_top, const Index padding_bottom, + const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const { + return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + // Morphing 
operators. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset) const { + return TensorChippingOp(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp + pad(const PaddingDimensions& padding) const { + return TensorPaddingOp(derived(), padding, internal::scalar_cast_op()(0)); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp + pad(const PaddingDimensions& padding, const Scalar padding_value) const { + return TensorPaddingOp(derived(), padding, padding_value); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp(derived(), shuffle); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp + stride(const 
Strides& strides) const { + return TensorStridingOp(derived(), strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorInflationOp + inflate(const Strides& strides) const { + return TensorInflationOp(derived(), strides); + } + + // Returns a tensor containing index/value tuples + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorIndexTupleOp + index_tuples() const { + return TensorIndexTupleOp(derived()); + } + + // Support for custom unary and binary operations + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomUnaryOp customOp(const CustomUnaryFunc& op) const { + return TensorCustomUnaryOp(derived(), op); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomBinaryOp customOp(const OtherDerived& other, const CustomBinaryFunc& op) const { + return TensorCustomBinaryOp(derived(), other, op); + } + + // Force the evaluation of the expression. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorForcedEvalOp eval() const { + return TensorForcedEvalOp(derived()); + } + + protected: + template friend class Tensor; + template friend class TensorFixedSize; + template friend class TensorBase; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } +}; + +template::value> +class TensorBase : public TensorBase { + public: + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef Scalar CoeffReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; + + template friend class Tensor; + template friend class TensorFixedSize; + template friend class TensorBase; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { + return setConstant(Scalar(0)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { + return derived() = this->constant(val); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return 
derived() = this->random(); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->template random(); + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setValues( + const typename internal::Initializer::InitList& vals) { + TensorEvaluator eval(derived(), DefaultDevice()); + internal::initialize_tensor(eval, vals); + return derived(); + } +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const OtherDerived& other) { + return derived() = derived() + other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const OtherDerived& other) { + return derived() = derived() - other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const OtherDerived& other) { + return derived() = derived() * other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const OtherDerived& other) { + return derived() = derived() / other.derived(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp + swap_layout() { + return TensorLayoutSwapOp(derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, const Axis& axis) const { + return TensorConcatenationOp(derived(), other, axis); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorConcatenationOp + concatenate(const OtherDerived& other, const Axis& axis) { + return TensorConcatenationOp(derived(), other, axis); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + 
TensorReshapingOp + reshape(const NewDimensions& newDimensions) { + return TensorReshapingOp(derived(), newDimensions); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) { + return TensorSlicingOp(derived(), startIndices, sizes); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset) const { + return TensorChippingOp(derived(), offset, DimId); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset) { + return TensorChippingOp(derived(), offset, DimId); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset, const Index dim) { + return TensorChippingOp(derived(), offset, dim); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReverseOp + reverse(const ReverseDimensions& 
rev) { + return TensorReverseOp(derived(), rev); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp(derived(), shuffle); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp + shuffle(const Shuffle& shuffle) { + return TensorShufflingOp(derived(), shuffle); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp + stride(const Strides& strides) { + return TensorStridingOp(derived(), strides); + } + + // Select the device on which to evaluate the expression. + template + TensorDevice device(const DeviceType& device) { + return TensorDevice(device, derived()); + } + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } +}; +#endif // EIGEN_PARSED_BY_DOXYGEN +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h new file mode 100644 index 00000000..260dcfe6 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -0,0 +1,392 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H +#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H + +namespace Eigen { + +/** \class TensorBroadcasting + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor broadcasting class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorBroadcastingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorBroadcastingOp type; +}; + +template +struct is_input_scalar { + static const bool value = false; +}; +template <> +struct is_input_scalar > { + static const bool value = true; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template +struct is_input_scalar > { + static const bool value = (Sizes::total_size == 1); +}; +#endif + +} // end namespace internal + + + +template +class TensorBroadcastingOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) + : m_xpr(expr), m_broadcast(broadcast) {} + + EIGEN_DEVICE_FUNC + const Broadcast& broadcast() const { return 
m_broadcast; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Broadcast m_broadcast; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorBroadcastingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_broadcast(op.broadcast()),m_impl(op.expression(), device) + { + // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar + // and store the result in a scalar. Instead one should reshape the scalar into a a N-D + // tensor with N >= 1 of 1 element first and then broadcast. 
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + const InputDimensions& input_dims = m_impl.dimensions(); + const Broadcast& broadcast = op.broadcast(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i] * broadcast[i]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + m_inputStrides[NumDims-1] = 1; + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims-2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const + { + if (internal::is_input_scalar::type>::value) { + return m_impl.coeff(0); + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + return coeffColMajor(index); + } else { + return coeffRowMajor(index); + } + } + + // TODO: attempt to speed this up. 
The integer divisions and modulo are slow + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + inputIndex += index; + } else { + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[0]); + } + } + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + { + Index inputIndex = 0; + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + inputIndex += index; + } else { + if (internal::index_statically_eq(NumDims-1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[NumDims-1]); + } + } + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const + { + 
if (internal::is_input_scalar::type>::value) { + return internal::pset1(m_impl.coeff(0)); + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + return packetColMajor(index); + } else { + return packetRowMajor(index); + } + } + + // Ignore the LoadMode and always use unaligned loads since we can't guarantee + // the alignment at compile time. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + innermostLoc = index; + } else { + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[0]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. 
+ if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + values[0] = m_impl.coeff(inputIndex); + for (int i = 1; i < PacketSize; ++i) { + values[i] = coeffColMajor(originalIndex+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + innermostLoc = index; + } else { + if (internal::index_statically_eq(NumDims-1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[NumDims-1]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. 
+ if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + values[0] = m_impl.coeff(inputIndex); + for (int i = 1; i < PacketSize; ++i) { + values[i] = coeffRowMajor(originalIndex+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + double compute_cost = TensorOpCost::AddCost(); + if (NumDims > 0) { + for (int i = NumDims - 1; i > 0; --i) { + compute_cost += TensorOpCost::DivCost(); + if (internal::index_statically_eq(i, 1)) { + compute_cost += + TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else { + if (!internal::index_statically_eq(i, 1)) { + compute_cost += TensorOpCost::MulCost() + + TensorOpCost::ModCost() + + TensorOpCost::AddCost(); + } + } + compute_cost += + TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } + } + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + const TensorEvaluator& impl() const { return m_impl; } + + Broadcast functor() const { return m_broadcast; } + + protected: + const Broadcast m_broadcast; + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h new file mode 100644 index 00000000..e81b0718 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -0,0 +1,384 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H + +namespace Eigen { + +/** \class TensorKChippingReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. + * + * + */ + +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - 1; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorChippingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorChippingOp type; +}; + +template +struct DimensionId +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { + eigen_assert(dim == DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return DimId; + } +}; +template <> +struct DimensionId +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) { + eigen_assert(dim >= 0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return actual_dim; + } + private: + const DenseIndex actual_dim; +}; + + +} // end namespace internal + + + +template +class TensorChippingOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename 
Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) + : m_xpr(expr), m_offset(offset), m_dim(dim) { + } + + EIGEN_DEVICE_FUNC + const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC + const Index dim() const { return m_dim.actualDim(); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Index m_offset; + const internal::DimensionId m_dim; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorChippingOp XprType; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets. 
+ IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) + { + EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(NumInputDims > m_dim.actualDim()); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + eigen_assert(op.offset() < input_dims[m_dim.actualDim()]); + + int j = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (i != m_dim.actualDim()) { + m_dimensions[j] = input_dims[i]; + ++j; + } + } + + m_stride = 1; + m_inputStride = 1; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < m_dim.actualDim(); ++i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } else { + for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } + m_inputStride *= input_dims[m_dim.actualDim()]; + m_inputOffset = m_stride * op.offset(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + 
(static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + Index inputIndex = index * m_inputStride + m_inputOffset; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = m_impl.coeff(inputIndex); + inputIndex += m_inputStride; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + return m_impl.template packet(index + m_inputOffset); + } else { + const Index idx = index / m_stride; + const Index rem = index - idx * m_stride; + if (rem + PacketSize <= m_stride) { + Index inputIndex = idx * m_inputStride + m_inputOffset + rem; + return m_impl.template packet(inputIndex); + } else { + // Cross the stride boundary. Fallback to slow path. 
+ EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index); + ++index; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + double cost = 0; + if ((static_cast(Layout) == static_cast(ColMajor) && + m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && + m_dim.actualDim() == NumInputDims - 1)) { + cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else if ((static_cast(Layout) == static_cast(ColMajor) && + m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && + m_dim.actualDim() == 0)) { + cost += TensorOpCost::AddCost(); + } else { + cost += 3 * TensorOpCost::MulCost() + TensorOpCost::DivCost() + + 3 * TensorOpCost::AddCost(); + } + + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { + CoeffReturnType* result = const_cast(m_impl.data()); + if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) && + result) { + return result + m_inputOffset; + } else { + return NULL; + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex; + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division. 
+ eigen_assert(m_stride == 1); + inputIndex = index * m_inputStride + m_inputOffset; + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + inputIndex = index + m_inputOffset; + } else { + const Index idx = index / m_stride; + inputIndex = idx * m_inputStride + m_inputOffset; + index -= idx * m_stride; + inputIndex += index; + } + return inputIndex; + } + + Dimensions m_dimensions; + Index m_stride; + Index m_inputOffset; + Index m_inputStride; + TensorEvaluator m_impl; + const internal::DimensionId m_dim; + const Device& m_device; +}; + + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorChippingOp XprType; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + + if ((static_cast(this->Layout) == static_cast(ColMajor) 
&& this->m_dim.actualDim() == 0) || + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(this->m_stride == 1); + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + internal::pstore(values, x); + Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + for (int i = 0; i < PacketSize; ++i) { + this->m_impl.coeffRef(inputIndex) = values[i]; + inputIndex += this->m_inputStride; + } + } else if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. + eigen_assert(this->m_stride > index); + this->m_impl.template writePacket(index + this->m_inputOffset, x); + } else { + const Index idx = index / this->m_stride; + const Index rem = index - idx * this->m_stride; + if (rem + PacketSize <= this->m_stride) { + const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; + this->m_impl.template writePacket(inputIndex, x); + } else { + // Cross stride boundary. Fallback to slow path. + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + internal::pstore(values, x); + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index) = values[i]; + ++index; + } + } + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h new file mode 100644 index 00000000..ecbacc28 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -0,0 +1,361 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H + +namespace Eigen { + +/** \class TensorConcatenationOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor concatenation class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type::ret Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConcatenationOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConcatenationOp type; +}; + +} // end namespace internal + + +template +class TensorConcatenationOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::nested::type Nested; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) + : 
m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const TensorConcatenationOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Axis m_axis; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorConcatenationOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + static const int RightNumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); 
+ EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(0 <= m_axis && m_axis < NumDims); + const Dimensions& lhs_dims = m_leftImpl.dimensions(); + const Dimensions& rhs_dims = m_rightImpl.dimensions(); + { + int i = 0; + for (; i < m_axis; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. + eigen_assert(rhs_dims[i] > 0); + m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; + for (++i; i < NumDims; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_leftStrides[0] = 1; + m_rightStrides[0] = 1; + m_outputStrides[0] = 1; + + for (int j = 1; j < NumDims; ++j) { + m_leftStrides[j] = m_leftStrides[j-1] * lhs_dims[j-1]; + m_rightStrides[j] = m_rightStrides[j-1] * rhs_dims[j-1]; + m_outputStrides[j] = m_outputStrides[j-1] * m_dimensions[j-1]; + } + } else { + m_leftStrides[NumDims - 1] = 1; + m_rightStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + + for (int j = NumDims - 2; j >= 0; --j) { + m_leftStrides[j] = m_leftStrides[j+1] * lhs_dims[j+1]; + m_rightStrides[j] = m_rightStrides[j+1] * rhs_dims[j+1]; + m_outputStrides[j] = m_outputStrides[j+1] * m_dimensions[j+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) + { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. + // See CL/76180724 comments for more ideas. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Collect dimension-wise indices (subs). + array subs; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[NumDims - 1] = index; + } + + const Dimensions& left_dims = m_leftImpl.dimensions(); + if (subs[m_axis] < left_dims[m_axis]) { + Index left_index; + if (static_cast(Layout) == static_cast(ColMajor)) { + left_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } else { + left_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } + return m_leftImpl.coeff(left_index); + } else { + subs[m_axis] -= left_dims[m_axis]; + const Dimensions& right_dims = m_rightImpl.dimensions(); + Index right_index; + if (static_cast(Layout) == static_cast(ColMajor)) { + right_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } else { + right_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } + return m_rightImpl.coeff(right_index); + } + } + + // TODO(phli): Add a 
real vectorization. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost() + + TensorOpCost::ModCost()); + const double lhs_size = m_leftImpl.dimensions().TotalSize(); + const double rhs_size = m_rightImpl.dimensions().TotalSize(); + return (lhs_size / (lhs_size + rhs_size)) * + m_leftImpl.costPerCoeff(vectorized) + + (rhs_size / (lhs_size + rhs_size)) * + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_leftStrides; + array m_rightStrides; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; + const Axis m_axis; +}; + +// Eval as lvalue +template + struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorConcatenationOp XprType; + typedef typename Base::Dimensions Dimensions; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) + : Base(op, device) + { + EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + 
typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + // Collect dimension-wise indices (subs). + array subs; + for (int i = Base::NumDims - 1; i > 0; --i) { + subs[i] = index / this->m_outputStrides[i]; + index -= subs[i] * this->m_outputStrides[i]; + } + subs[0] = index; + + const Dimensions& left_dims = this->m_leftImpl.dimensions(); + if (subs[this->m_axis] < left_dims[this->m_axis]) { + Index left_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i]; + } + return this->m_leftImpl.coeffRef(left_index); + } else { + subs[this->m_axis] -= left_dims[this->m_axis]; + const Dimensions& right_dims = this->m_rightImpl.dimensions(); + Index right_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i]; + } + return this->m_rightImpl.coeffRef(right_index); + } + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); + + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + internal::pstore(values, x); + for (int i = 0; i < packetSize; ++i) { + coeffRef(index+i) = values[i]; + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h new file mode 100644 index 00000000..06c1e8f6 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ 
-0,0 +1,628 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H + +namespace Eigen { + +/** \class TensorContraction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor contraction class. + * + * + */ +namespace internal { + +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename gebp_traits::type, + typename remove_const::type>::ResScalar Scalar; + + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + + // From NumDims below. + static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; + static const int Layout = traits::Layout; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorContractionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorContractionOp type; +}; + +template +struct traits, Device_> > { + typedef Indices_ Indices; + typedef LeftArgType_ LeftArgType; + typedef RightArgType_ RightArgType; + typedef Device_ Device; + + // From NumDims below. 
+ static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; +}; + +} // end namespace internal + +template +class TensorContractionOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename internal::gebp_traits::ResScalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} + + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Indices m_indices; +}; + + +template +struct TensorContractionEvaluatorBase +{ + typedef typename internal::traits::Indices Indices; + typedef typename internal::traits::LeftArgType LeftArgType; + typedef typename internal::traits::RightArgType RightArgType; + typedef typename internal::traits::Device Device; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + IsAligned = true, + PacketAccess = (internal::unpacket_traits::size > 1), + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = true + }; + + // Most of the code is assuming that both input tensors are 
ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + typedef DSizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), + op.lhsExpression(), op.rhsExpression()), device), + m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), + op.rhsExpression(), op.lhsExpression()), device), + m_device(device), + m_result(NULL) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + + DSizes eval_left_dims; + DSizes eval_right_dims; + array, ContractDims> eval_op_indices; + if (static_cast(Layout) == static_cast(ColMajor)) { + // For ColMajor, we keep using the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[i]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[i]; + } + // We keep the pairs of contracting indices. 
+ for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = op.indices()[i].first; + eval_op_indices[i].second = op.indices()[i].second; + } + } else { + // For RowMajor, we need to reverse the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; + } + // We need to flip all the pairs of contracting indices as well as + // reversing the dimensions. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; + } + } + + // Check for duplicate axes and make sure the first index in eval_op_indices + // is increasing. Using O(n^2) sorting is OK since ContractDims is small + for (int i = 0; i < ContractDims; i++) { + for (int j = i + 1; j < ContractDims; j++) { + eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first && + eval_op_indices[j].second != eval_op_indices[i].second && + "contraction axes should be unique"); + if (eval_op_indices[j].first < eval_op_indices[i].first) { + numext::swap(eval_op_indices[j], eval_op_indices[i]); + } + } + } + + array lhs_strides; + lhs_strides[0] = 1; + for (int i = 0; i < LDims-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; + } + + array rhs_strides; + rhs_strides[0] = 1; + for (int i = 0; i < RDims-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; + } + + if (m_i_strides.size() > 0) m_i_strides[0] = 1; + if (m_j_strides.size() > 0) m_j_strides[0] = 1; + if (m_k_strides.size() > 0) m_k_strides[0] = 1; + + m_i_size = 1; + m_j_size = 1; + m_k_size = 1; + + // To compute the dimension, we simply concatenate the non-contracting + // dimensions of the left and then the right tensor. 
Additionally, we also + // compute the strides corresponding to the left non-contracting + // dimensions and right non-contracting dimensions. + m_lhs_inner_dim_contiguous = true; + int dim_idx = 0; + unsigned int nocontract_idx = 0; + + for (int i = 0; i < LDims; i++) { + // find if we are contracting on index i of left tensor + bool contracting = false; + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].first == i) { + contracting = true; + break; + } + } + if (!contracting) { + // add dimension size to output dimensions + m_dimensions[dim_idx] = eval_left_dims[i]; + m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; + if (dim_idx != i) { + m_lhs_inner_dim_contiguous = false; + } + if (nocontract_idx+1 < internal::array_size::value) { + m_i_strides[nocontract_idx+1] = + m_i_strides[nocontract_idx] * eval_left_dims[i]; + } else { + m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; + } + dim_idx++; + nocontract_idx++; + } + } + + nocontract_idx = 0; + for (int i = 0; i < RDims; i++) { + bool contracting = false; + // find if we are contracting on index i of right tensor + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].second == i) { + contracting = true; + break; + } + } + if (!contracting) { + m_dimensions[dim_idx] = eval_right_dims[i]; + if (nocontract_idx+1 < internal::array_size::value) { + m_j_strides[nocontract_idx+1] = + m_j_strides[nocontract_idx] * eval_right_dims[i]; + } else { + m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; + } + m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; + dim_idx++; + nocontract_idx++; + } + } + + // Now compute the strides corresponding to the contracting dimensions. We + // assumed above that non-contracting axes are represented in the same order + // in the matrix as they are in the tensor. This is not the case for + // contracting axes. 
As the contracting axes must be of the same size in + // each tensor, we'll only look at the first tensor here. + m_rhs_inner_dim_contiguous = true; + m_rhs_inner_dim_reordered = false; + for (int i = 0; i < ContractDims; i++) { + Index left = eval_op_indices[i].first; + Index right = eval_op_indices[i].second; + + Index size = eval_left_dims[left]; + eigen_assert(size == eval_right_dims[right] && + "Contraction axes must be same size"); + + if (i+1 < static_cast(internal::array_size::value)) { + m_k_strides[i+1] = m_k_strides[i] * size; + } else { + m_k_size = m_k_strides[i] * size; + } + m_left_contracting_strides[i] = lhs_strides[left]; + m_right_contracting_strides[i] = rhs_strides[right]; + + if (i > 0 && right < eval_op_indices[i-1].second) { + m_rhs_inner_dim_reordered = true; + } + if (right != i) { + m_rhs_inner_dim_contiguous = false; + } + } + + // If the layout is RowMajor, we need to reverse the m_dimensions + if (static_cast(Layout) == static_cast(RowMajor)) { + for (int i = 0, j = NumDims - 1; i < j; i++, j--) { + numext::swap(m_dimensions[i], m_dimensions[j]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalProduct(buffer); + } + else { + static_cast(this)->template evalProduct(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalProduct(buffer); + } + else { + 
static_cast(this)->template evalProduct(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalProduct(buffer); + } + else { + static_cast(this)->template evalProduct(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalProduct(buffer); + } + else { + static_cast(this)->template evalProduct(buffer); + } + } + } + } + + template + EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { + const Index rows = m_i_size; + const Index cols = m_k_size; + + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + const Index lhs_packet_size = internal::unpacket_traits::size; + const Index rhs_packet_size = internal::unpacket_traits::size; + const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned; + const int rhs_alignment = RightEvaluator::IsAligned ? 
Aligned : Unaligned; + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, + m_left_contracting_strides, m_k_strides); + RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, + m_right_contracting_strides, m_k_strides); + + const Scalar alpha(1); + const Index resIncr(1); + + // zero out the result buffer (which must be of size at least rows * sizeof(Scalar) + m_device.memset(buffer, 0, rows * sizeof(Scalar)); + + internal::general_matrix_vector_product::run( + rows, cols, lhs, rhs, + buffer, resIncr, alpha); + } + + template + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::gebp_traits Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + const Index lhs_packet_size = internal::unpacket_traits::size; + const Index rhs_packet_size = internal::unpacket_traits::size; + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + // Declare GEBP packing and kernel structs + internal::gemm_pack_lhs pack_lhs; + internal::gemm_pack_rhs pack_rhs; + + internal::gebp_kernel gebp; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, 
this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // Sizes of the blocks to load in cache. See the Goto paper for details. + internal::TensorContractionBlocking blocking(k, m, n, 1); + const Index kc = blocking.kc(); + const Index mc = numext::mini(m, blocking.mc()); + const Index nc = numext::mini(n, blocking.nc()); + const Index sizeA = mc * kc; + const Index sizeB = kc * nc; + + LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + + for(Index i2=0; i2m_device.deallocate(blockA); + this->m_device.deallocate(blockB); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } + + protected: + // Prevent assignment + TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); + Dimensions m_dimensions; + + contract_t m_k_strides; + contract_t m_left_contracting_strides; + contract_t m_right_contracting_strides; + + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + left_nocontract_t m_i_strides; + 
right_nocontract_t m_j_strides; + left_nocontract_t m_left_nocontract_strides; + right_nocontract_t m_right_nocontract_strides; + + Index m_i_size; + Index m_j_size; + Index m_k_size; + + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; + const Device& m_device; + Scalar* m_result; +}; + + +// evaluator for default device +template +struct TensorEvaluator, Device> : + public TensorContractionEvaluatorBase< + TensorEvaluator, Device> > { + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + // Could we use NumDimensions here? 
+ typedef DSizes Dimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) { } + + template + EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + return; + } + + this->template evalGemm(buffer); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h new file mode 100644 index 00000000..0728841c --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -0,0 +1,56 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H + + +namespace Eigen { +namespace internal { + +enum { + ShardByRow = 0, + ShardByCol = 1 +}; + + +// Default Blocking Strategy +template +class TensorContractionBlocking { + public: + + typedef typename LhsMapper::Scalar LhsScalar; + typedef typename RhsMapper::Scalar RhsScalar; + + EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : + kc_(k), mc_(m), nc_(n) + { + if (ShardingType == ShardByCol) { + computeProductBlockingSizes(kc_, mc_, nc_, num_threads); + } + else { + computeProductBlockingSizes(kc_, nc_, mc_, num_threads); + } + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } + + private: 
+ Index kc_; + Index mc_; + Index nc_; +}; + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h new file mode 100644 index 00000000..4fcc60b9 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -0,0 +1,1391 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014-2015 Benoit Steiner +// Copyright (C) 2015 Navdeep Jaitly +// Copyright (C) 2014 Eric Martin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +namespace Eigen { + +template +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column 
major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. + + // storage indices + const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; + const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // threadIdx.x: the vertical position in an 8x8 block + // threadIdx.y: the vertical index of the 8x8 
block in the grid + // threadIdx.z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but now that loop is unrolled in the below code. + + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = conv(0); \ + lhs_pf1 = conv(0); \ + lhs_pf2 = conv(0); \ + lhs_pf3 = conv(0); \ + lhs_pf4 = conv(0); \ + lhs_pf5 = conv(0); \ + lhs_pf6 = conv(0); \ + lhs_pf7 = conv(0); \ + \ + rhs_pf0 = conv(0); \ + rhs_pf1 = conv(0); \ + rhs_pf2 = conv(0); \ + rhs_pf3 = conv(0); \ + rhs_pf4 = conv(0); \ + rhs_pf5 = conv(0); \ + rhs_pf6 = conv(0); \ + rhs_pf7 = conv(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + 
lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ 
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + 
rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = conv(0); \ + Scalar res(i, 1) = conv(0); \ + Scalar res(i, 2) = conv(0); \ + Scalar res(i, 3) = conv(0); \ + Scalar res(i, 4) = conv(0); \ + Scalar res(i, 5) = conv(0); \ + Scalar res(i, 6) = conv(0); \ + Scalar res(i, 7) = conv(0); \ + + internal::scalar_cast_op conv; + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. Despite common sense, + // the code is a bit faster with this here then at bottom of loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). 
+ +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; + const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef 
computePass + } // end loop over k + + // we've now iterated over all of the large (ie width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation. +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. There's 2 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. 
+ // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (threadIdx.x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + + if (threadIdx.x < max_i_write) { + if (max_j_write == 8) { + // TODO: can i trade bank conflicts for coalesced writes? 
+ Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res +} + + +template +__global__ void +__launch_bounds__(512) +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ Scalar lhs_shmem[72 * 64]; + __shared__ Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; 
+ const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + + +template +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i=0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + + Index lhs_vert = base_m+threadIdx.x*4; + + for (Index k = 0; k < k_size; k += 16) { + lhs_pf0 = internal::pset1(0); + rhs_pf0 = internal::pset1(0); + + Index lhs_horiz = threadIdx.y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(threadIdx.x%4)*4; + Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY 
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((threadIdx.x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + if((threadIdx.x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. 
(60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... + + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; + + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = threadIdx.y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + 
i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
+ for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + + Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1(0); + lhs_pf1 = internal::pset1(0); + lhs_pf2 = internal::pset1(0); + lhs_pf3 = internal::pset1(0); + + rhs_pf0 = internal::pset1(0); + rhs_pf1 = internal::pset1(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 
=lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + 
lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y/4+k+24) < k_size) 
{ + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+threadIdx.x*4; + Index rhs_horiz0 = threadIdx.y*2+base_n; + Index rhs_horiz1 = threadIdx.y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = 
rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+threadIdx.x*4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+threadIdx.x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. 
+ rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = 
make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; + + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. + int start_feature = (threadIdx.y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + + __syncthreads(); + Index horiz_base = (threadIdx.y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert 
+ 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + typedef float2 LHS_MEM16x16[32][16]; + typedef float2 RHS_MEM16x16[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m 
+ 127) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + + +template +struct TensorEvaluator, GpuDevice> : + public TensorContractionEvaluatorBase, GpuDevice> > { + + 
typedef GpuDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + } + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_CUDA_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + }; + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + 
const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + } + }; + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); + LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and __CUDACC__ +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h new file mode 100644 index 00000000..4f31a3cb --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -0,0 +1,469 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H + +namespace Eigen { + +namespace internal { + +enum { + Rhs = 0, + Lhs = 1 +}; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. + */ + +template struct CoeffLoader { + enum { + DirectOffsets = false + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) { + eigen_assert(false && "unsupported"); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename Tensor::PacketReturnType packet(typename Tensor::Index index) const + { + return m_tensor.template packet(index); + } + + + private: + const Tensor m_tensor; +}; + +template struct CoeffLoader { + enum { + DirectOffsets = true + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_data += offset; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename Tensor::PacketReturnType packet(typename Tensor::Index index) const + { + return internal::ploadt_ro(m_data + index); + } + private: + typedef typename Tensor::Scalar Scalar; + const Scalar* m_data; +}; + +template +class SimpleTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC + 
SimpleTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) { } + + enum { + DirectOffsets = CoeffLoader::DirectOffsets + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_tensor.offsetBuffer(offset); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: xxxps://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 + Index nocontract_val = left ? row : col; + Index linidx = 0; + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? 
col : row; + if(array_size::value > 0) { + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + } + + return linidx; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { + const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: xxxps://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 + Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; + Index linidx[2] = {0, 0}; + if (array_size::value > array_size::value) { + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? 
col : row + distance}; + if (array_size::value> 0) { + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + } + return IndexPair(linidx[0], linidx[1]); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { + // Only claim alignment when we can compute the actual stride (ie when we're + // dealing with the lhs with inner_dim_contiguous. This is because the + // matrix-vector product relies on the stride when dealing with aligned inputs. + return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { + return ((side == Lhs) && inner_dim_contiguous && array_size::value > 0) ? 
m_contract_strides[0] : 1; + } + + protected: + CoeffLoader m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + + +template +class BaseTensorContractionMapper : public SimpleTensorContractionMapper +{ + public: + typedef SimpleTensorContractionMapper ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + typedef typename Tensor::PacketReturnType Packet; + typedef typename unpacket_traits::half HalfPacket; + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); + return this->m_tensor.template packet(index); + } + + const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index last = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. 
+ if (Tensor::PacketAccess && + (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && + (last - first) == (packet_size - 1)) { + + return this->m_tensor.template packet(first); + } + + EIGEN_ALIGN_MAX Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(last); + + return pload(data); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + const Index half_packet_size = unpacket_traits::size; + if (half_packet_size == packet_size) { + return loadPacket(i, j); + } + EIGEN_ALIGN_MAX Scalar data[half_packet_size]; + for (Index k = 0; k < half_packet_size; k++) { + data[k] = operator()(i + k, j); + } + return pload(data); + } +}; + + +template +class BaseTensorContractionMapper : public SimpleTensorContractionMapper +{ + public: + typedef SimpleTensorContractionMapper ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + typedef typename Tensor::PacketReturnType Packet; + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_ALIGN_MAX Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload(data); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { + return loadPacket(i, j); + 
} +}; + + +template +class TensorContractionSubMapper { + public: + typedef typename Tensor::PacketReturnType Packet; + typedef typename unpacket_traits::half HalfPacket; + + typedef BaseTensorContractionMapper ParentMapper; + typedef TensorContractionSubMapper Self; + typedef Self LinearMapper; + + enum { + // We can use direct offsets iff the parent mapper supports then and we can compute the strides. + // TODO: we should also enable direct offsets for the Rhs case. + UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size::value > 0) + }; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { + // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute + // this offset every time we attempt to access a coefficient. + if (UseDirectOffsets) { + Index stride = m_base_mapper.stride(); + m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride); + } + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper(i, 0); + } + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper(i, j); + } + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, j); + } + return m_base_mapper.template loadPacket(i + 
m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadHalfPacket(i, 0); + } + return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + if (UseDirectOffsets) { + m_base_mapper.storePacket(i, 0, p); + } + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + if (UseDirectOffsets) { + return LinearMapper(m_base_mapper, i, j); + } + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? 
Aligned : Unaligned; + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { + return false; + } + + private: + ParentMapper m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + +template +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef Scalar_ Scalar; + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; + + EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } +}; + + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h new file mode 100644 index 00000000..bbc0c141 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -0,0 +1,1043 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H + +// evaluator for thread pool device +#ifdef EIGEN_USE_THREADS + +namespace Eigen { + +#ifdef EIGEN_USE_SIMPLE_THREAD_POOL +namespace internal { + +template +struct packLhsArg { + LhsScalar* blockA; + const LhsMapper& lhs; + const Index m_start; + const Index k_start; + const Index mc; + const Index kc; +}; + +template +struct packRhsAndKernelArg { + const MaxSizeVector* blockAs; + RhsScalar* blockB; + const RhsMapper& rhs; + OutputMapper& output; + const Index m; + const Index k; + const Index n; + const Index mc; + const Index kc; + const Index nc; + const Index num_threads; + const Index num_blockAs; + const Index max_m; + const Index k_block_idx; + const Index m_block_idx; + const Index n_block_idx; + const Index m_blocks; + const Index n_blocks; + MaxSizeVector* kernel_notifications; + const MaxSizeVector* lhs_notifications; + const bool need_to_pack; +}; + +} // end namespace internal +#endif // EIGEN_USE_SIMPLE_THREAD_POOL + +template +struct TensorEvaluator, ThreadPoolDevice> : + public TensorContractionEvaluatorBase, ThreadPoolDevice> > { + + typedef ThreadPoolDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
+ typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::gebp_traits Traits; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + +#ifndef EIGEN_USE_SIMPLE_THREAD_POOL + template + void evalProduct(Scalar* buffer) const { + typedef internal::TensorContractionInputMapper< + LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, + contract_t, internal::packet_traits::size, + lhs_inner_dim_contiguous, false, Unaligned> + LhsMapper; + typedef internal::TensorContractionInputMapper< + RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, + contract_t, internal::packet_traits::size, + rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> + RhsMapper; + typedef internal::blas_data_mapper OutputMapper; + typedef internal::gemm_pack_lhs + LhsPacker; + typedef internal::gemm_pack_rhs< + RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> + RhsPacker; + typedef internal::gebp_kernel + GebpKernel; + + const Index m = this->m_i_size; + const 
Index n = this->m_j_size; + const Index k = this->m_k_size; + if (m == 0 || n == 0 || k == 0) return; + + // Compute a set of algorithm parameters: + // - kernel block sizes (bm, bn, bk) + // - task grain sizes (number of kernels executed per task: gm, gn) + // - number of threads + // - sharding by row/column + // - parallel packing or first lhs then rhs + // and some derived parameters: + // - number of tasks (nm, nn, nk) + // - number of kernels (nm0, nn0) + // Unfortunately, all these parameters are tightly interdependent. + // So in some cases we first compute approximate values, then compute other + // values based on these approximations and then refine the approximations. + + // There are lots of heuristics here. There is some reasoning behind them, + // but ultimately they are just tuned on contraction benchmarks for + // different input configurations, thread counts and instruction sets. + // So feel free to question any of them. + + // Compute whether we want to shard by row or by column. + // This is a first approximation, it will be refined later. Since we don't + // know number of threads yet we use 2, because what we are most + // interested in at this point is whether it makes sense to use + // parallelization at all or not. + bool shard_by_col = shardByCol(m, n, 2); + + // First approximation of kernel blocking sizes. + // Again, we don't know number of threads yet, so we use 2. + Index bm, bn, bk; + if (shard_by_col) { + internal::TensorContractionBlocking + blocking(k, m, n, 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking + blocking(k, m, n, 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Compute optimal number of threads. + // Note: we use bk instead of k here because we are interested in amount of + // _parallelizable_ computations, and computations are not parallelizable + // across k dimension.
+ const TensorOpCost cost = + contractionCost(m, n, bm, bn, bk, shard_by_col, false); + int num_threads = TensorCostModel::numThreads( + static_cast(n) * m, cost, this->m_device.numThreads()); + + // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost + // model is not tuned. Remove this when the cost model is tuned. + if (n == 1) num_threads = 1; + + if (num_threads == 1) { + // The single-threaded algorithm should be faster in this case. + if (n == 1) + this->template evalGemv(buffer); + else + this->template evalGemm(buffer); + return; + } + + // Now that we know number of threads, recalculate sharding and blocking. + shard_by_col = shardByCol(m, n, num_threads); + if (shard_by_col) { + internal::TensorContractionBlocking + blocking(k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking + blocking(k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Number of kernels for each dimension. + Index nm0 = divup(m, bm); + Index nn0 = divup(n, bn); + Index nk = divup(k, bk); + + // Calculate task grain size (number of kernels executed per task). + // This task size coarsening serves two purposes: + // 1. It reduces per-task overheads including synchronization overheads. + // 2. It allows to use caches better (reuse the same packed rhs in several + // consecutive kernels). + Index gm = 1; + Index gn = 1; + // If we are sharding by column, then we prefer to reduce rows first. + if (shard_by_col) { + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + } else { + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + } + // Number of tasks in each dimension. 
+ Index nm = divup(nm0, gm); + Index nn = divup(nn0, gn); + + // Last but not least, decide whether we want to issue both lhs and rhs + // packing in parallel; or issue lhs packing first, and then issue rhs + // packing when lhs packing completes (for !shard_by_col lhs and rhs are + // swapped). Parallel packing allows more parallelism (for both packing and + // kernels), while sequential packing provides better locality (once + // a thread finishes rhs packing it proceeds to kernels with that rhs). + // First, we are interested in parallel packing if there are few tasks. + bool parallel_pack = num_threads >= nm * nn; + // Also do parallel packing if all data fits into L2$. + if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <= + l2CacheSize() * num_threads) + parallel_pack = true; + // But don't do it if we will use each rhs only once. Locality seems to be + // more important in this case. + if ((shard_by_col ? nm : nn) == 1) parallel_pack = false; + + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, + this->m_i_strides, this->m_left_contracting_strides, + this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, + this->m_j_strides, this->m_right_contracting_strides, + this->m_k_strides); + + Context(this->m_device, num_threads, lhs, rhs, buffer, m, n, + k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, + shard_by_col, parallel_pack) + .run(); + } + + // Context coordinates a single parallel gemm operation.
+ template + class Context { + public: + Context(const Device& device, int num_threads, LhsMapper& lhs, + RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, + Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, + Index gn, Index nm0, Index nn0, bool shard_by_col, + bool parallel_pack) + : device_(device), + lhs_(lhs), + rhs_(rhs), + buffer_(buffer), + output_(buffer, tm), + num_threads_(num_threads), + shard_by_col_(shard_by_col), + parallel_pack_(parallel_pack), + m_(tm), + n_(tn), + k_(tk), + bm_(bm), + bn_(bn), + bk_(bk), + nm_(nm), + nn_(nn), + nk_(nk), + gm_(gm), + gn_(gn), + nm0_(nm0), + nn0_(nn0) + { + for (Index x = 0; x < P; x++) { + // Normal number of notifications for k slice switch is + // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only + // nm_ + nn_ notifications, because they will not receive notifications + // from preceding kernels. + state_switch_[x] = + x == 0 + ? 1 + : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) + + (x == P - 1 ? nm_ * nn_ : 0); + state_packing_ready_[x] = + parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_); + state_kernel_[x] = new std::atomic*[nm_]; + for (Index m = 0; m < nm_; m++) { + state_kernel_[x][m] = new std::atomic[nn_]; + // Kernels generally receive 3 notifications (previous kernel + 2 + // packing), but the first slice won't get notifications from previous + // kernels. + for (Index n = 0; n < nn_; n++) + state_kernel_[x][m][n].store( + (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1), + std::memory_order_relaxed); + } + } + + // Allocate memory for packed rhs/lhs matrices.
+ size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + size_t lhs_size = + divup(bm_ * bk_ * sizeof(LhsScalar), align) * align; + size_t rhs_size = + divup(bn_ * bk_ * sizeof(RhsScalar), align) * align; + packed_mem_ = static_cast(internal::aligned_malloc( + (nm0_ * lhs_size + nn0_ * rhs_size) * std::min(nk_, P - 1))); + char* mem = static_cast(packed_mem_); + for (Index x = 0; x < numext::mini(nk_, P - 1); x++) { + packed_lhs_[x].resize(nm0_); + for (Index m = 0; m < nm0_; m++) { + packed_lhs_[x][m] = reinterpret_cast(mem); + mem += lhs_size; + } + packed_rhs_[x].resize(nn0_); + for (Index n = 0; n < nn0_; n++) { + packed_rhs_[x][n] = reinterpret_cast(mem); + mem += rhs_size; + } + } + } + + ~Context() { + for (Index x = 0; x < P; x++) { + for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; + delete[] state_kernel_[x]; + } + internal::aligned_free(packed_mem_); + } + + void run() { + // Kick off packing of the first slice. + signal_switch(0, 1); + // Wait for overall completion. + // TODO(dvyukov): this wait can lead to deadlock. + // If nthreads contractions are concurrently submitted from worker + // threads, this wait will block all worker threads and the system will + // deadlock. + done_.Wait(); + } + + private: + Notification done_; + const Device& device_; + LhsMapper& lhs_; + RhsMapper& rhs_; + Scalar* const buffer_; + OutputMapper output_; + const int num_threads_; + const bool shard_by_col_; + const bool parallel_pack_; + // Matrix sizes. + const Index m_; + const Index n_; + const Index k_; + // Block sizes. + const Index bm_; + const Index bn_; + const Index bk_; + // Number of tasks. + const Index nm_; + const Index nn_; + const Index nk_; + // Task grain sizes (number of kernels executed per task). + const Index gm_; + const Index gn_; + // Number of blocks (this is different from nm_/nn_ because of task size + // coarsening). + const Index nm0_; + const Index nn0_; + + // Parallelization strategy.
+ // + // Blocks related to the same k block can run in parallel because they write + // to different output blocks. So we parallelize within k slices, this + // gives us parallelism level of m x n. Before we can start any kernels + // related to k-th slice, we need to issue m lhs packing tasks and n rhs + // packing tasks. + // + // However, there is a bottleneck when we are finishing kernels for k-th + // slice (at the very end there is only 1 runnable kernel). To mitigate this + // bottleneck we allow kernels from k-th and k+1-th slices to run in + // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same + // output block, so they must not run in parallel. + // + // This gives us the following dependency graph. + // On each k slice we have m x n kernel tasks, m lhs packing tasks and n rhs + // packing tasks. + // Kernel (m, n, k) can start when: + // - kernel (m, n, k-1) has finished + // - lhs packing (m, k) has finished + // - rhs packing (n, k) has finished + // Lhs/rhs packing can start when: + // - all k-1 packing has finished (artificially imposed to limit amount of + // parallel packing) + // + // On top of that we limit runnable tasks to two consecutive k slices. + // This is done to limit amount of memory we need for packed lhs/rhs + // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_). + // + // state_switch_ tracks when we are ready to switch to the next k slice. + // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n). + // These variables are rolling over 3 consecutive k slices: first two we are + // actively executing + one to track completion of kernels in the second + // slice. + static const Index P = 3; + void* packed_mem_; + std::vector packed_lhs_[P - 1]; + std::vector packed_rhs_[P - 1]; + std::atomic** state_kernel_[P]; + // state_switch_ is frequently modified by worker threads, while other + // fields are read-only after constructor.
Let's move it to a separate cache + // line to reduce cache-coherency traffic. + char pad_[128]; + std::atomic state_packing_ready_[P]; + std::atomic state_switch_[P]; + + void pack_lhs(Index m, Index k) { + const Index mend = m * gm_ + gm(m); + for (Index m1 = m * gm_; m1 < mend; m1++) + LhsPacker()(packed_lhs_[k % (P - 1)][m1], + lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); + + if (!parallel_pack_ && shard_by_col_) { + signal_packing(k); + } else { + signal_switch(k + 1); + for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0); + } + } + + void pack_rhs(Index n, Index k) { + const Index nend = n * gn_ + gn(n); + for (Index n1 = n * gn_; n1 < nend; n1++) { + if (k == 0) { + // Zero the output memory in parallel. + // On 10000x2x10000 mm zeroing can easily take half of time. + // Zero (bn x m) row. Safe to do here because all kernels that will + // write to this memory depend on completion of this task. + // Note: don't call device_.memset() here. device_.memset() blocks on + // thread pool worker thread, which can lead to underutilization and + // deadlocks. + memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar)); + } + RhsPacker()(packed_rhs_[k % (P - 1)][n1], + rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); + } + + if (parallel_pack_ || shard_by_col_) { + signal_switch(k + 1); + for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0); + } else { + signal_packing(k); + } + } + + void kernel(Index m, Index n, Index k) { + // Note: order of iteration matters here. Iteration over m is innermost + // because we want to reuse the same packed rhs in consequetive tasks + // (rhs fits into L2$ while lhs only into L3$). 
+ const Index nend = n * gn_ + gn(n); + const Index mend = m * gm_ + gm(m); + if (shard_by_col_) { + for (Index n1 = n * gn_; n1 < nend; n1++) { + for (Index m1 = m * gm_; m1 < mend; m1++) + GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), + packed_lhs_[k % (P - 1)][m1], + packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), + Scalar(1), -1, -1, 0, 0); + } + } else { + for (Index m1 = m * gm_; m1 < mend; m1++) + for (Index n1 = n * gn_; n1 < nend; n1++) { + GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), + packed_lhs_[k % (P - 1)][m1], + packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), + Scalar(1), -1, -1, 0, 0); + } + } + signal_kernel(m, n, k + 1, false); + signal_switch(k + 2); + } + + void signal_packing(Index k) { + eigen_assert(!parallel_pack_); + Index s = state_packing_ready_[k % P].fetch_sub(1); + eigen_assert(s > 0); + if (s != 1) return; + state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_; + enqueue_packing(k, shard_by_col_); + } + + void signal_kernel(Index m, Index n, Index k, bool sync) { + std::atomic* state = &state_kernel_[k % P][m][n]; + Index s = state->load(); + eigen_assert(s > 0); + if (s != 1 && state->fetch_sub(1) != 1) return; + state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); + if (sync) + kernel(m, n, k); + else + device_.enqueueNoNotification([=]() { kernel(m, n, k); }); + } + + void signal_switch(Index k, Index v = 1) { + Index s = state_switch_[k % P].fetch_sub(v); + eigen_assert(s >= v); + if (s != v) return; + + // Ready to switch to the next k slice. + // Reset counter for the next iteration. + state_switch_[k % P] = + (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) + + nm_ * nn_; + if (k < nk_) { + // Issue lhs/rhs packing. Their completion will in turn kick off + // kernels. 
+ if (parallel_pack_) { + enqueue_packing(k, !shard_by_col_); + enqueue_packing(k, shard_by_col_); + } else if (shard_by_col_) { + enqueue_packing(k, false); + } else { + enqueue_packing(k, true); + } + + // Termination handling. + // Because kernel completion signals k + 2 switch, we need to finish nk + // + 2 slices without issuing any tasks on nk + 1 slice. So here we + // pretend that all nk + 1 packing tasks just finish instantly; so that + // nk + 2 switch only waits for completion of nk kernels. + } else if (k == nk_) { + signal_switch(k + 1, + parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)); + } else { + done_.Notify(); + } + } + + // Enqueue all rhs/lhs packing for k-th slice. + void enqueue_packing(Index k, bool rhs) { + enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs); + } + + void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) { + if (end - start == 1) { + if (rhs) + pack_rhs(start, k); + else + pack_lhs(start, k); + } else { + Index mid = (start + end) / 2; + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(mid, end, k, rhs); }); + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(start, mid, k, rhs); }); + } + } + + // Block sizes with accounting for potentially incomplete last block. + Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; } + Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; } + Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; } + // Task grain sizes accounting for potentially incomplete last task. + Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; } + Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; } + + Context(const Context&) = delete; + void operator=(const Context&) = delete; + }; + + // Decide whether we want to shard m x n contraction by columns or by rows. 
+ static bool shardByCol(Index m, Index n, Index num_threads) { + // Note: we are comparing both n and m against Traits::nr, it is not + // a mistake. We are trying to figure out how both n and m will fit into + // the main sharding dimension. + + // Sharding by column is the default + // ... unless there is enough data for vectorization over rows + if (m / num_threads >= Traits::nr && + // and not enough data for vectorization over columns + (n / num_threads < Traits::nr || + // ... or barely enough data for vectorization over columns, + // but it is not evenly dividable across threads + (n / num_threads < 4 * Traits::nr && + (n % (num_threads * Traits::nr)) != 0 && + // ... and it is evenly dividable across threads for rows + ((m % (num_threads * Traits::nr)) == 0 || + // .. or it is not evenly dividable for both dimensions but + // there is much more data over rows so that corner effects are + // mitigated. + (m / n >= 6))))) + return false; + // Wait, or if matrices are just substantially prolonged over the other + // dimension. + if (n / num_threads < 16 * Traits::nr && m > n * 32) return false; + return true; + } + + Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn, + int num_threads, bool shard_by_col) const { + Index gm = 1; + Index gm1 = 1; + Index nm0 = divup(m, bm); + Index nm1 = nm0; + for (;;) { + // Find the next candidate for m grain size. It needs to result in + // different number of blocks. E.g. if we have 10 kernels, we want to try + // 5 and 10, but not 6, 7, 8 and 9. + while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++; + if (gm1 > nm0) break; + // Check the candidate. + int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, + shard_by_col); + if (res < 0) break; + nm1 = divup(nm0, gm1); + if (res == 0) continue; + // Commit new grain size. 
+ gm = gm1; + } + return gm; + } + + Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm, + int num_threads, bool shard_by_col) const { + Index gn = 1; + Index gn1 = 1; + Index nn0 = divup(n, bn); + Index nn1 = nn0; + for (;;) { + while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++; + if (gn1 > nn0) break; + int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, + shard_by_col); + if (res < 0) break; + nn1 = divup(nn0, gn1); + if (res == 0) continue; + gn = gn1; + } + return gn; + } + + // checkGrain checks whether grain (gm, gn) is suitable and is better than + // (oldgm, oldgn). + int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm, + Index gn, Index oldgm, Index oldgn, int num_threads, + bool shard_by_col) const { + const TensorOpCost cost = + contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true); + double taskSize = TensorCostModel::taskSize( + static_cast(bm) * gm * bn * gn, cost); + // If the task is too small, then we agree on it regardless of anything + // else. Otherwise synchronization overheads will dominate. + if (taskSize < 1) return 1; + // If it is too large, then we reject it and all larger tasks. + if (taskSize > 2) return -1; + // Now we are in presumably good task size range. + // The main deciding factor here is parallelism. Consider that we have 12 + // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes. + // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4 + // of cores will be busy). While grain size 3 gives us 4 tasks, which gives + // us parallelism of 1 (we can load all cores). 
+ Index nm0 = divup(m, bm); + Index nn0 = divup(n, bn); + Index new_tasks = divup(nm0, gm) * divup(nn0, gn); + double new_parallelism = static_cast(new_tasks) / + (divup(new_tasks, num_threads) * num_threads); + Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn); + double old_parallelism = static_cast(old_tasks) / + (divup(old_tasks, num_threads) * num_threads); + if (new_parallelism > old_parallelism || new_parallelism == 1) return 1; + return 0; + } + +#else // EIGEN_USE_SIMPLE_THREAD_POOL + + template + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + return; + } + + evalGemm(buffer); + } + + template + void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + + const int lhs_packet_size = internal::unpacket_traits::size; + const int rhs_packet_size = internal::unpacket_traits::size; + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + // TODO: packing could be faster sometimes if we supported row major tensor mappers + typedef internal::gemm_pack_lhs LhsPacker; + typedef internal::gemm_pack_rhs RhsPacker; + + // TODO: replace false, false with conjugate values? 
+ typedef internal::gebp_kernel GebpKernel; + + typedef internal::packLhsArg packLArg; + typedef internal::packRhsAndKernelArg packRKArg; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // compute block sizes (which depend on number of threads) + const Index num_threads = this->m_device.numThreads(); + internal::TensorContractionBlocking blocking(k, m, n, num_threads); + Index mc = blocking.mc(); + Index nc = blocking.nc(); + Index kc = blocking.kc(); + eigen_assert(mc <= m); + eigen_assert(nc <= n); + eigen_assert(kc <= k); + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + const Index k_blocks = CEIL_DIV(k, kc); + const Index n_blocks = CEIL_DIV(n, nc); + const Index m_blocks = CEIL_DIV(m, mc); + const Index sizeA = mc * kc; + const Index sizeB = kc * nc; + + /* cout << "m: " << m << " n: " << n << " k: " << k << endl; + cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; + cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; + cout << "num threads: " << num_threads << endl; + */ + + // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB + // aren't 16 byte aligned segfaults will happen due to SIMD instructions + // note: You can get away with allocating just a single blockA and offsets and meet the + // the alignment requirements with the assumption that + // (Traits::mr * sizeof(ResScalar)) % 16 == 0 + const Index numBlockAs = numext::mini(num_threads, m_blocks); + MaxSizeVector blockAs(num_threads); + for (int i = 0; i < num_threads; i++) { + blockAs.push_back(static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); + } + + // To circumvent alignment 
issues, I'm just going to separately allocate the memory for each thread + // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. + // Other options: (1) reuse memory when a thread finishes. con: tricky + // (2) allocate block B memory in each thread. con: overhead + MaxSizeVector blockBs(n_blocks); + for (int i = 0; i < n_blocks; i++) { + blockBs.push_back(static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); + } + + // lhs_notifications starts with all null Notifications + MaxSizeVector lhs_notifications(num_threads, nullptr); + + // this should really be numBlockAs * n_blocks; + const Index num_kernel_notifications = num_threads * n_blocks; + MaxSizeVector kernel_notifications(num_kernel_notifications, + nullptr); + + for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { + const Index k_start = k_block_idx * kc; + // make sure we don't overshoot right edge of left matrix + const Index actual_kc = numext::mini(k_start + kc, k) - k_start; + + for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { + const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs); + + for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { + const Index m_start = mt_block_idx * mc; + const Index actual_mc = numext::mini(m_start + mc, m) - m_start; + eigen_assert(actual_mc > 0); + + Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; + + for (int i = 0; i < n_blocks; ++i) { + Index notification_id = (blockAId * n_blocks + i); + // Wait for any current kernels using this slot to complete + // before using it. 
+ if (kernel_notifications[notification_id]) { + wait_until_ready(kernel_notifications[notification_id]); + delete kernel_notifications[notification_id]; + } + kernel_notifications[notification_id] = new Notification(); + } + const packLArg arg = { + blockAs[blockAId], // blockA + lhs, // lhs + m_start, // m + k_start, // k + actual_mc, // mc + actual_kc, // kc + }; + + // Delete any existing notification since we may be + // replacing it. The algorithm should ensure that there are + // no existing waiters on this notification. + delete lhs_notifications[blockAId]; + lhs_notifications[blockAId] = + this->m_device.enqueue(&Self::packLhs, arg); + } + + // now start kernels. + const Index m_base_start = m_block_idx * mc; + const bool need_to_pack = m_block_idx == 0; + + for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { + const Index n_start = n_block_idx * nc; + const Index actual_nc = numext::mini(n_start + nc, n) - n_start; + + // first make sure the previous kernels are all done before overwriting rhs. Also wait if + // we're going to start new k. In both cases need_to_pack is true. 
+ if (need_to_pack) { + for (Index i = num_blocks; i < num_threads; ++i) { + Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; + Index future_id = (blockAId * n_blocks + n_block_idx); + wait_until_ready(kernel_notifications[future_id]); + } + } + + packRKArg arg = { + &blockAs, // blockA + blockBs[n_block_idx], // blockB + rhs, // rhs + output, // output + m_base_start, // m + k_start, // k + n_start, // n + mc, // mc + actual_kc, // kc + actual_nc, // nc + num_threads, + numBlockAs, + m, + k_block_idx, + m_block_idx, + n_block_idx, // n_block_idx + m_blocks, // m_blocks + n_blocks, // n_blocks + &kernel_notifications, // kernel notifications + &lhs_notifications, // lhs notifications + need_to_pack, // need_to_pack + }; + + // We asynchronously kick off this function, which ends up + // notifying the appropriate kernel_notifications objects, + // which this thread waits on before exiting. + this->m_device.enqueueNoNotification(&Self::packRhsAndKernel, arg); + } + } + } + + // Make sure all the kernels are done. + for (size_t i = 0; i < kernel_notifications.size(); ++i) { + wait_until_ready(kernel_notifications[i]); + delete kernel_notifications[i]; + } + + // No need to wait for lhs notifications since they should have + // already been waited on. Just clean them up. + for (size_t i = 0; i < lhs_notifications.size(); ++i) { + delete lhs_notifications[i]; + } + + // deallocate all of the memory for both A and B's + for (size_t i = 0; i < blockAs.size(); i++) { + this->m_device.deallocate(blockAs[i]); + } + for (size_t i = 0; i < blockBs.size(); i++) { + this->m_device.deallocate(blockBs[i]); + } + +#undef CEIL_DIV + } + + /* + * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing + * the LHS block, check that all of the kernels that worked on the same + * mt_block_idx in the previous m_block are done. 
+ */ + template + static void packLhs(const packLArg arg) { + // perform actual packing + LhsPacker pack_lhs; + pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); + } + + /* + * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that + * all kernels in the previous block are done. + * Then for each LHS future, we wait on the future and then call GEBP + * on the area packed by the future (which starts at + * blockA + future_idx * mt * kc) on the LHS and with the full packed + * RHS block. + * The output of this GEBP is written to output(m + i * mt, n). + */ + template + static void packRhsAndKernel(packRKArg arg) { + if (arg.need_to_pack) { + RhsPacker pack_rhs; + pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); + } + + GebpKernel gebp; + for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { + const Index m_base_start = arg.m + arg.mc*mt_block_idx; + if (m_base_start < arg.max_m) { + Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; + wait_until_ready((*arg.lhs_notifications)[blockAId]); + const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start; + gebp(arg.output.getSubMapper(m_base_start, arg.n), + (*arg.blockAs)[blockAId], arg.blockB, + actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0); + + // Notify that the kernel is done. + const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; + (*arg.kernel_notifications)[set_idx]->Notify(); + } + } + } +#endif // EIGEN_USE_SIMPLE_THREAD_POOL + + TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, + bool shard_by_col, bool prepacked) const { + const int packed_size = std::min(PacketType::size, + PacketType::size); + const int output_packet_size = internal::unpacket_traits::size; + const double kd = static_cast(bk); + // Peak VFMA bandwidth is 0.5. 
However if we have not enough data for + // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined + // experimentally. + double computeBandwidth = bk == 1 ? 4.0 : + (shard_by_col ? bn : bm) < Traits::nr || + (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5; +#ifndef EIGEN_VECTORIZE_FMA + // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. + // However for MULPS/ADDPS we have dependent sequence of 2 such instructions, + // so overall bandwidth is 1.0. + if (computeBandwidth == 0.5) computeBandwidth = 1.0; +#endif + // Computations. + TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size); + // Output stores. + cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + if (prepacked) { + // Packing and kernels are executed in different tasks. When we calculate + // task grain size we look only at kernel cost assuming that kernel + // is more expensive than packing. + return cost; + } + // Lhs/rhs loads + computations. + TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n); + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m); + // Lhs packing memory cost does not contribute considerably to overall + // execution time because lhs is prefetched early and accessed sequentially. + if (shard_by_col) + lhsCost.dropMemoryCost(); + else + rhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_THREADS +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h new file mode 100644 index 00000000..7c33cc5d --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -0,0 +1,279 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H + +namespace Eigen { + +/** \class TensorConversionOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor conversion class. This class makes it possible to vectorize + * type casting operations when the number of scalars per packet in the source + * and the destination type differ + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef TargetType Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConversionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConversionOp type; +}; + +} // end namespace internal + + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + return internal::pcast(m_impl.template packet(index)); + } + + private: + const TensorEvaluator& m_impl; +}; + + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + 
SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); + SrcPacket src3 = m_impl.template packet(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2, src3, src4); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + // Only call m_impl.packet() when we have direct access to the underlying data. This + // ensures that we don't compute the subexpression twice. We may however load some + // coefficients twice, but in practice this doesn't negatively impact performance. 
+ if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) { + // Force unaligned memory loads since we can't ensure alignment anymore + return internal::pcast(m_impl.template packet(index)); + } else { + const int TgtPacketSize = internal::unpacket_traits::size; + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TgtType; + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; + for (int i = 0; i < TgtPacketSize; ++i) { + values[i] = converter(m_impl.coeff(index+i)); + } + TgtPacket rslt = internal::pload(values); + return rslt; + } + } + + private: + const TensorEvaluator& m_impl; + const typename TensorEvaluator::Index m_maxIndex; +}; + +template +class TensorConversionOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::nested::type Nested; + typedef Scalar CoeffReturnType; + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) + : m_xpr(xpr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +template struct ConversionSubExprEval { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) { + impl.evalSubExprsIfNeeded(NULL); + return true; + } +}; + +template struct ConversionSubExprEval { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) { + return impl.evalSubExprsIfNeeded(data); + } +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorConversionOp XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef TargetType 
Scalar; + typedef TargetType CoeffReturnType; + typedef typename internal::remove_all::Scalar>::type SrcType; + typedef typename PacketType::type PacketReturnType; + typedef typename PacketType::type PacketSourceType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, + PacketAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) + { + return ConversionSubExprEval::value, TensorEvaluator, Scalar>::run(m_impl, data); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + internal::scalar_cast_op converter; + return converter(m_impl.coeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const bool Vectorizable = TensorEvaluator::PacketAccess & + internal::type_casting_traits::VectorizedCast; + return PacketConv::run(m_impl, index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double cast_cost = TensorOpCost::CastCost(); + if (vectorized) { + const double SrcCoeffRatio = + internal::type_casting_traits::SrcCoeffRatio; + const double TgtCoeffRatio = + internal::type_casting_traits::TgtCoeffRatio; + return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) + + TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize)); + } else { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost); + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + template + struct PacketConv 
{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = converter(impl.coeff(index+i)); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + }; + + template + struct PacketConv { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { + const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; + PacketConverter, PacketSourceType, PacketReturnType, + SrcCoeffRatio, TgtCoeffRatio> converter(impl); + return converter.template packet(index); + } + }; + + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h new file mode 100644 index 00000000..98d4dc38 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -0,0 +1,1104 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. 
+ * + * + */ +namespace internal { + +template +class IndexMapper { + public: + IndexMapper(const InputDims& input_dims, const array& kernel_dims, + const array& indices) { + + array dimensions = input_dims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = indices[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + dimensions[index] = result_dim; + } + + array inputStrides; + array outputStrides; + if (static_cast(Layout) == static_cast(ColMajor)) { + inputStrides[0] = 1; + outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; + outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; + } + } else { + inputStrides[NumDims - 1] = 1; + outputStrides[NumDims - 1] = 1; + for (int i = static_cast(NumDims) - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; + } + } + + array cudaInputDimensions; + array cudaOutputDimensions; + array tmp = dimensions; + array ordering; + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = i + offset; + ordering[index] = indices[i]; + tmp[indices[i]] = -1; + cudaInputDimensions[index] = input_dims[indices[i]]; + cudaOutputDimensions[index] = dimensions[indices[i]]; + } + + int written = static_cast(Layout) == static_cast(ColMajor) + ? 
NumKernelDims + : 0; + for (int i = 0; i < NumDims; ++i) { + if (tmp[i] >= 0) { + ordering[written] = i; + cudaInputDimensions[written] = input_dims[i]; + cudaOutputDimensions[written] = dimensions[i]; + ++written; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[ordering[i]]; + m_outputStrides[i] = outputStrides[ordering[i]]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_cudaInputStrides[i] = + m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1]; + m_cudaOutputStrides[i] = + m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (i + 1 < offset) { + m_cudaInputStrides[i] = + m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; + m_cudaOutputStrides[i] = + m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[NumKernelDims]; + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[limit]; + } + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + if (static_cast(Layout) == 
static_cast(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[NumKernelDims]; + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[limit]; + } + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 
0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + + k * m_inputStrides[offset + 2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + + k * m_outputStrides[offset + 2]; + } + + private: + static const int NumDims = internal::array_size::value; + array m_inputStrides; + array m_outputStrides; + array m_cudaInputStrides; + array m_cudaOutputStrides; +}; + + + +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type::ret Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConvolutionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConvolutionOp type; +}; + +} // end namespace internal + + + +template +class TensorConvolutionOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind 
StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& + inputExpression() const { return m_input_xpr; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& + kernelExpression() const { return m_kernel_xpr; } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + + +template +struct TensorEvaluator, Device> +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == 
static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } + } + + m_dimensions = m_inputImpl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + preloadKernel(); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + CoeffReturnType result = CoeffReturnType(0); + convolve(firstInput(index), 0, NumKernelDims-1, result); + return result; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const + { + Index indices[2] = {index, index+PacketSize-1}; + Index startInputs[2] = {0, 0}; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1]-startInputs[0] == PacketSize-1) { + PacketReturnType result = internal::pset1(0); + convolvePacket(startInputs[0], 0, NumKernelDims-1, result); + return result; + } else { + EIGEN_ALIGN_MAX Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, 
NumKernelDims-1, data[0]); + for (int i = 1; i < PacketSize-1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); + } + data[PacketSize-1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); + return internal::pload(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = + TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } + startInput += index; + return startInput; + } + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolve(input, kernel, 
DimIndex-1, accum); + } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } + + template + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex-1, accum); + } else { + accum = internal::pmadd(m_inputImpl.template packet(input), internal::pset1(m_kernel[kernel]), accum); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + array m_inputStride; + array m_outputStride; + + array m_indexStride; + array m_kernelStride; + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device& m_device; +}; + + + + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. 
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +template +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { + return StaticKernelSize; + } +}; +template <> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { + return kernelSize; + } +}; + +template +__global__ void EigenConvolutionKernel1D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int kernelSize, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = blockIdx.y * blockDim.y; + const int plane_stride = blockDim.y * gridDim.y; + + for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.y * num_x_input; + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; + #pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { + result += s[k + kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + 
indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + +template +__global__ void EigenConvolutionKernel2D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int numY, const int maxY, const int kernelSizeX, + const int kernelSizeY, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = blockIdx.z * blockDim.z; + const int plane_stride = blockDim.z * gridDim.z; + + for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.z * num_y_input; + + // Load inputs to shared memory + #pragma unroll + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + #pragma unroll + for (int i = threadIdx.x; i < 
num_x_output; i += blockDim.x) { + float result = 0.0f; + #pragma unroll + for (int l = 0; l < GetKernelSize()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); + #pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + +template +__global__ void EigenConvolutionKernel3D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const size_t numPlanes, const size_t numX, + const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, + const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, + const size_t kernelSizeZ, float* buffer) { + extern __shared__ float s[]; + + // Load inputs to shared memory + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = blockIdx.z * maxZ; + const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + + + +template +struct TensorEvaluator, GpuDevice> +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename 
TensorEvaluator::Dimensions KernelDimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = internal::unpacket_traits::size; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + 
m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator::Dimensions InputDims; + + const int maxSharedMem = m_device.sharedMemPerBlock(); + const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = m_device.getNumCudaMultiProcessors(); + const int warpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + int maxX; + dim3 block_size; + + const int single_stride_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 
0 + : m_inputImpl.dimensions().rank() - 1; + if (m_indices[0] == single_stride_dim) { + // Maximum the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = numext::mini(inner_dim, numX); + const int maxP = numext::mini(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = numext::mini(maxThreadsPerBlock, maxX); + block_size.y = numext::mini(maxThreadsPerBlock / block_size.x, maxP); + } + else { + // Read as much as possible alongside the inner most dimension, that is the plane + const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); + const int maxP = numext::mini(inner_dim, numP); + maxX = numext::mini(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = numext::mini(warpSize, maxX); + block_size.y = numext::mini(maxThreadsPerBlock/block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, numext::mini(num_y_blocks, ceil(numP, block_size.y))); + + + //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices(m_indices[0]); + const array kernel_dims(m_kernelImpl.dimensions()[0]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch(kernel_size) { + case 4: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, 
indexMapper, m_kernel, numP, numX, maxX, 4, data); + break; + } + case 7: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + } + } + break; + } + + case 2: { + const int idxX = + static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; + const int idxY = + static_cast(Layout) == static_cast(ColMajor) ? 1 : 0; + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numP = dimensions().TotalSize() / (numX*numY); + + const float scaling_factor = sqrtf(static_cast(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = numext::mini(inner_dim, numX); + const int maxY = numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = numext::mini(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = numext::mini(1024, maxX); + block_size.y = numext::mini(1024/block_size.x, maxY); + block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int blocksPerProcessor = 
numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini(num_z_blocks, ceil(numP, block_size.z))); + + + //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices(m_indices[idxX], m_indices[idxY]); + const array kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; + } + default: { 
+ LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + break; + } + + case 3: { + const int idxX = + static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; + const int idxY = + static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; + const int idxZ = + static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; + + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numZ = dimensions()[m_indices[idxZ]]; + const int numP = dimensions().TotalSize() / (numX*numY*numZ); + + const int maxX = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); + const int maxY = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); + const int maxZ = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); + + dim3 block_size; + block_size.x = numext::mini(32, maxX); + block_size.y = numext::mini(32, maxY); + block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << 
num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + const array indices(m_indices[idxX], m_indices[idxY], + m_indices[idxZ]); + const array kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY], + m_kernelImpl.dimensions()[idxZ]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf+index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. 
+ const double convolve_compute_cost = + TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = (const TensorEvaluator&); + + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h new file mode 100644 index 00000000..60cb0458 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -0,0 +1,212 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H +#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H + +namespace Eigen { + +/** \class TensorCostModel + * \ingroup CXX11_Tensor_Module + * + * \brief A cost model used to limit the number of threads used for evaluating + * tensor expression.
+ * + */ + +// Class storing the cost of evaluating a tensor expression in terms of the +// estimated number of operand bytes loads, bytes stored, and compute cycles. +class TensorOpCost { + public: + // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple + // model based on minimal reciprocal throughput numbers from Intel or + // Agner Fog's tables would be better than what is there now. + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() { + return internal::functor_traits< + internal::scalar_product_op >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() { + return internal::functor_traits >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() { + return internal::functor_traits< + internal::scalar_quotient_op >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() { + return internal::functor_traits >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() { + return internal::functor_traits< + internal::scalar_cast_op >::Cost; + } + + EIGEN_DEVICE_FUNC + TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {} + EIGEN_DEVICE_FUNC + TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles) + : bytes_loaded_(bytes_loaded), + bytes_stored_(bytes_stored), + compute_cycles_(compute_cycles) {} + + EIGEN_DEVICE_FUNC + TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, + bool vectorized, double packet_size) + : bytes_loaded_(bytes_loaded), + bytes_stored_(bytes_stored), + compute_cycles_(vectorized ? 
compute_cycles / packet_size + : compute_cycles) { + eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded)); + eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored)); + eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { + return bytes_loaded_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { + return bytes_stored_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { + return compute_cycles_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost( + double load_cost, double store_cost, double compute_cost) const { + return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + + compute_cost * compute_cycles_; + } + + // Drop memory access component. Intended for cases when memory accesses are + // sequential or are completely masked by computations. + EIGEN_DEVICE_FUNC void dropMemoryCost() { + bytes_loaded_ = 0; + bytes_stored_ = 0; + } + + // TODO(rmlarsen): Define min in terms of total cost, not elementwise. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin( + const TensorOpCost& rhs) const { + double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded()); + double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored()); + double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles()); + return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); + } + + // TODO(rmlarsen): Define max in terms of total cost, not elementwise. 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax( + const TensorOpCost& rhs) const { + double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); + double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored()); + double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles()); + return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=( + const TensorOpCost& rhs) { + bytes_loaded_ += rhs.bytes_loaded(); + bytes_stored_ += rhs.bytes_stored(); + compute_cycles_ += rhs.compute_cycles(); + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) { + bytes_loaded_ *= rhs; + bytes_stored_ *= rhs; + compute_cycles_ *= rhs; + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+( + TensorOpCost lhs, const TensorOpCost& rhs) { + lhs += rhs; + return lhs; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( + TensorOpCost lhs, double rhs) { + lhs *= rhs; + return lhs; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( + double lhs, TensorOpCost rhs) { + rhs *= lhs; + return rhs; + } + + friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) { + return os << "[bytes_loaded = " << tc.bytes_loaded() + << ", bytes_stored = " << tc.bytes_stored() + << ", compute_cycles = " << tc.compute_cycles() << "]"; + } + + private: + double bytes_loaded_; + double bytes_stored_; + double compute_cycles_; +}; + +// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads +// in [1:max_threads] instead of just switching multi-threading off for small +// work units. +template +class TensorCostModel { + public: + // Scaling from Eigen compute cost to device cycles. + static const int kDeviceCyclesPerComputeCycle = 1; + + // Costs in device cycles.
+ static const int kStartupCycles = 100000; + static const int kPerThreadCycles = 100000; + static const int kTaskSize = 40000; + + // Returns the number of threads in [1:max_threads] to use for + // evaluating an expression with the given output size and cost per + // coefficient. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( + double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { + double cost = totalCost(output_size, cost_per_coeff); + int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; + return numext::mini(max_threads, numext::maxi(1, threads)); + } + + // taskSize assesses parallel task size. + // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task + // granularity needs to be increased to mitigate parallelization overheads. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize( + double output_size, const TensorOpCost& cost_per_coeff) { + return totalCost(output_size, cost_per_coeff) / kTaskSize; + } + + private: + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost( + double output_size, const TensorOpCost& cost_per_coeff) { + // Cost of memory fetches from L2 cache. 64 is typical cache line size. + // 11 is L2 cache latency on Haswell. + // We don't know whether data is in L1, L2 or L3. But we are most interested + // in single-threaded computational time around 100us-10ms (smaller time + // is too small for parallelization, larger time is not interesting + // either because we are probably using all available threads already). + // And for the target time range, L2 seems to be what matters. Data set + // fitting into L1 is too small to take noticeable time. Data set fitting + // only into L3 presumably will take more than 10ms to load and process. + const double kLoadCycles = 1.0 / 64 * 11; + const double kStoreCycles = 1.0 / 64 * 11; + // Scaling from Eigen compute cost to device cycles.
+ return output_size * + cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, + kDeviceCyclesPerComputeCycle); + } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h new file mode 100644 index 00000000..2e44bcf9 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -0,0 +1,313 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H +#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H + +namespace Eigen { + +/** \class TensorCustomUnaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. 
+ * + * + */ +namespace internal { +template +struct traits > +{ + typedef typename XprType::Scalar Scalar; + typedef typename XprType::StorageKind StorageKind; + typedef typename XprType::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCustomUnaryOp& type; +}; + +template +struct nested > +{ + typedef TensorCustomUnaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCustomUnaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func) + : m_expr(expr), m_func(func) {} + + EIGEN_DEVICE_FUNC + const CustomUnaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_expr; } + + protected: + typename XprType::Nested m_expr; + const CustomUnaryFunc m_func; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorCustomUnaryOp ArgType; + typedef typename internal::traits::Index Index; + static const int NumDims = internal::traits::NumDimensions; + typedef DSizes Dimensions; + typedef typename internal::remove_const::type Scalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, + PacketAccess = 
(internal::packet_traits::size > 1), + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) + { + m_dimensions = op.func().dimensions(op.expression()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast( + m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + + protected: + EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { + TensorMap > result( + data, m_dimensions); + m_op.func().eval(m_op.expression(), result, m_device); + } + + Dimensions m_dimensions; + const ArgType m_op; + const Device& m_device; + CoeffReturnType* m_result; +}; + + + +/** \class TensorCustomBinaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. 
+ * + * + */ +namespace internal { +template +struct traits > +{ + typedef typename internal::promote_storage_type::ret Scalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCustomBinaryOp& type; +}; + +template +struct nested > +{ + typedef TensorCustomBinaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCustomBinaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::traits::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func) + + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {} + + EIGEN_DEVICE_FUNC + const CustomBinaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const CustomBinaryFunc m_func; +}; + + +// Eval as rvalue +template 
+struct TensorEvaluator, Device> +{ + typedef TensorCustomBinaryOp XprType; + typedef typename internal::traits::Index Index; + static const int NumDims = internal::traits::NumDimensions; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) + { + m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. 
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + + protected: + EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { + TensorMap > result(data, m_dimensions); + m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); + } + + Dimensions m_dimensions; + const XprType m_op; + const Device& m_device; + CoeffReturnType* m_result; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h new file mode 100644 index 00000000..fd9ecea9 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -0,0 +1,68 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H + +namespace Eigen { + +/** \class TensorDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its argument + * on the specified computing 'device' (GPU, thread pool, ...) + * + * Example: + * C.device(EIGEN_GPU) = A + B; + * + * Todo: operator *= and /=. 
+ */ + +template class TensorDevice { + public: + TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + template + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + template + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h new file mode 100644 index 00000000..8e587215 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -0,0 +1,337 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H + +namespace Eigen { + +static const int kCudaScratchSize = 1024; + +// This defines an interface that GPUDevice can take to use +// CUDA streams underneath. +class StreamInterface { + public: + virtual ~StreamInterface() {} + + virtual const cudaStream_t& stream() const = 0; + virtual const cudaDeviceProp& deviceProperties() const = 0; + + // Allocate memory on the actual device where the computation will run + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; + + // Return a scratchpad buffer of size 1k + virtual void* scratchpad() const = 0; + + // Return a semaphore. The semaphore is initially initialized to 0, and + // each kernel using it is responsible for resetting to 0 upon completion + // to maintain the invariant that the semaphore is always equal to 0 upon + // each kernel start. + virtual unsigned int* semaphore() const = 0; +}; + +static cudaDeviceProp* m_deviceProperties; +static bool m_devicePropInitialized = false; + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. +#if __cplusplus >= 201103L + static std::atomic first(true); + if (first.exchange(false)) { +#else + static bool first = true; + if (first) { + first = false; +#endif + // We're the first thread to reach this point. 
+ int num_devices; + cudaError_t status = cudaGetDeviceCount(&num_devices); + if (status != cudaSuccess) { + std::cerr << "Failed to get the number of CUDA devices: " + << cudaGetErrorString(status) + << std::endl; + assert(status == cudaSuccess); + } + m_deviceProperties = new cudaDeviceProp[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = cudaGetDeviceProperties(&m_deviceProperties[i], i); + if (status != cudaSuccess) { + std::cerr << "Failed to initialize CUDA device #" + << i + << ": " + << cudaGetErrorString(status) + << std::endl; + assert(status == cudaSuccess); + } + } + +#if __cplusplus >= 201103L + std::atomic_thread_fence(std::memory_order_release); +#endif + m_devicePropInitialized = true; + } else { + // Wait for the other thread to initialize the properties. + while (!m_devicePropInitialized) { +#if __cplusplus >= 201103L + std::atomic_thread_fence(std::memory_order_acquire); +#endif + sleep(1); + } + } + } +} + +static const cudaStream_t default_stream = cudaStreamDefault; + +class CudaStreamDevice : public StreamInterface { + public: + // Use the default stream on the current device + CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { + cudaGetDevice(&device_); + initializeDeviceProp(); + } + // Use the default stream on the specified device + CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { + initializeDeviceProp(); + } + // Use the specified stream. Note that it's the + // caller's responsibility to ensure that the stream can run on + // the specified device. If no device is specified the code + // assumes that the stream is associated to the current gpu device.
+ CudaStreamDevice(const cudaStream_t* stream, int device = -1) + : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { + if (device < 0) { + cudaGetDevice(&device_); + } else { + int num_devices; + cudaError_t err = cudaGetDeviceCount(&num_devices); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + assert(device < num_devices); + device_ = device; + } + initializeDeviceProp(); + } + + virtual ~CudaStreamDevice() { + if (scratch_) { + deallocate(scratch_); + } + } + + const cudaStream_t& stream() const { return *stream_; } + const cudaDeviceProp& deviceProperties() const { + return m_deviceProperties[device_]; + } + virtual void* allocate(size_t num_bytes) const { + cudaError_t err = cudaSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + void* result; + err = cudaMalloc(&result, num_bytes); + assert(err == cudaSuccess); + assert(result != NULL); + return result; + } + virtual void deallocate(void* buffer) const { + cudaError_t err = cudaSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + assert(buffer != NULL); + err = cudaFree(buffer); + assert(err == cudaSuccess); + } + + virtual void* scratchpad() const { + if (scratch_ == NULL) { + scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + virtual unsigned int* semaphore() const { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + kCudaScratchSize; + semaphore_ = reinterpret_cast(scratch); + cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + } + return semaphore_; + } + + private: + const cudaStream_t* stream_; + int device_; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +struct GpuDevice { + // The StreamInterface is not owned: the caller is + // responsible for its initialization and eventual destruction. 
+ explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { + eigen_assert(stream); + } + explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { + eigen_assert(stream); + } + // TODO(bsteiner): This is an internal API, we should not expose it. + EIGEN_STRONG_INLINE const cudaStream_t& stream() const { + return stream_->stream(); + } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return stream_->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + stream_->deallocate(buffer); + } + + EIGEN_STRONG_INLINE void* scratchpad() const { + return stream_->scratchpad(); + } + + EIGEN_STRONG_INLINE unsigned int* semaphore() const { + return stream_->semaphore(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, + stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + cudaError_t err = + cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + cudaError_t err = + cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default 
device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48*1024; + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { +#if defined(__CUDACC__) && !defined(__CUDA_ARCH__) + cudaError_t err = cudaStreamSynchronize(stream_->stream()); + if (err != cudaSuccess) { + std::cerr << "Error detected in CUDA stream: " + << cudaGetErrorString(err) + << std::endl; + assert(err == cudaSuccess); + } +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { + return stream_->deviceProperties().multiProcessorCount; + } + EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { + return stream_->deviceProperties().maxThreadsPerBlock; + } + EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { + return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { + return stream_->deviceProperties().sharedMemPerBlock; + } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return stream_->deviceProperties().major; + } + EIGEN_STRONG_INLINE int minorDeviceVersion() const { + return stream_->deviceProperties().minor; + } + + EIGEN_STRONG_INLINE int maxBlocks() const { + return max_blocks_; + } + + // This function checks if the CUDA runtime recorded an error for the + // underlying stream device. 
+ inline bool ok() const { +#ifdef __CUDACC__ + cudaError_t error = cudaStreamQuery(stream_->stream()); + return (error == cudaSuccess) || (error == cudaErrorNotReady); +#else + return false; +#endif + } + + private: + const StreamInterface* stream_; + int max_blocks_; +}; + +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + assert(cudaGetLastError() == cudaSuccess); + + +// FIXME: Should be device and kernel specific. +#ifdef __CUDACC__ +static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { +#ifndef __CUDA_ARCH__ + cudaError_t status = cudaDeviceSetSharedMemConfig(config); + EIGEN_UNUSED_VARIABLE(status) + assert(status == cudaSuccess); +#else + EIGEN_UNUSED_VARIABLE(config) +#endif +} +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h new file mode 100644 index 00000000..427772bd --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -0,0 +1,81 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H + + +namespace Eigen { + +// Default device for the machine (typically a single cpu core) +struct DefaultDevice { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU + return 1; +#else + // Running on a CUDA device + return 32; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU + return l1CacheSize(); +#else + // Running on a CUDA device, return the amount of shared memory available. 
+ return 48*1024; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + return l3CacheSize(); +#else + // Running on a CUDA device + return firstLevelCacheSize(); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + // Should return an enum that encodes the ISA supported by the CPU + return 1; +#else + // Running on a CUDA device + return __CUDA_ARCH__ / 100; +#endif + } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h new file mode 100644 index 00000000..56181836 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -0,0 +1,122 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Copyright (C) 2016 Benoit Steiner + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H + +namespace Eigen { +struct SyclDevice { + /// class members + /// sycl queue + mutable cl::sycl::queue m_queue; + /// std::map is the container used to make sure that we create only one buffer + /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. + /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. 
+ mutable std::map> buffer_map; + /// creating device by using selector + template SyclDevice(dev_Selector s) + : +#ifdef EIGEN_EXCEPTIONS + m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) { + for (const auto& e : l) { + try { + std::rethrow_exception(e); + } catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + } + } + })) +#else + m_queue(cl::sycl::queue(s)) +#endif + {} + // destructor + ~SyclDevice() { deallocate_all(); } + + template void deallocate(T *p) const { + auto it = buffer_map.find(p); + if (it != buffer_map.end()) { + buffer_map.erase(it); + internal::aligned_free(p); + } + } + void deallocate_all() const { + std::map>::iterator it=buffer_map.begin(); + while (it!=buffer_map.end()) { + auto p=it->first; + buffer_map.erase(it); + internal::aligned_free(const_cast(p)); + it=buffer_map.begin(); + } + buffer_map.clear(); + } + + /// creation of sycl accessor for a buffer. This function first tries to find + /// the buffer in the buffer_map. If found it gets the accessor from it, if not, + ///the function then adds an entry by creating a sycl buffer for that particular pointer. 
+ template inline cl::sycl::accessor + get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const { + return (get_sycl_buffer(num_bytes, ptr)->template get_access(cgh)); + } + + template inline std::pair>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const { + using Type = cl::sycl::buffer; + std::pair>::iterator,bool> ret = buffer_map.insert(std::pair>(ptr, std::shared_ptr(new Type(cl::sycl::range<1>(num_bytes)), + [](void *dataMem) { delete static_cast(dataMem); }))); + (static_cast(buffer_map.at(ptr).get()))->set_final_data(nullptr); + return ret; + } + + template inline cl::sycl::buffer* get_sycl_buffer(size_t num_bytes,const T * ptr) const { + return static_cast*>(add_sycl_buffer(ptr, num_bytes).first->second.get()); + } + + /// allocating memory on the cpu + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const { + return internal::aligned_malloc(8); + } + + // some runtime conditions that can be applied here + bool isDeviceSuitable() const { return true; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { + ::memcpy(dst, src, n); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { + auto host_acc= (static_cast*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access(); + memcpy(host_acc.get_pointer(), src, n); + } + /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const { + auto it = buffer_map.find(src); + if (it != buffer_map.end()) { + auto host_acc= (static_cast*>(it->second.get()))-> template get_access(); + memcpy(dst,host_acc.get_pointer(), n); + } else{ + eigen_assert("no device memory found. 
The memory might be destroyed before creation"); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return 1; + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h new file mode 100644 index 00000000..cfb27a08 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -0,0 +1,282 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H + +namespace Eigen { + +// Use the SimpleThreadPool by default. We'll switch to the new non blocking +// thread pool later. +#ifndef EIGEN_USE_SIMPLE_THREAD_POOL +template using ThreadPoolTempl = NonBlockingThreadPoolTempl; +typedef NonBlockingThreadPool ThreadPool; +#else +template using ThreadPoolTempl = SimpleThreadPoolTempl; +typedef SimpleThreadPool ThreadPool; +#endif + + +// Barrier is an object that allows one or more threads to wait until +// Notify has been called a specified number of times. 
+class Barrier { + public: + Barrier(unsigned int count) : state_(count << 1), notified_(false) { + eigen_assert(((count << 1) >> 1) == count); + } + ~Barrier() { + eigen_plain_assert((state_>>1) == 0); + } + + void Notify() { + unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2; + if (v != 1) { + eigen_assert(((v + 2) & ~1) != 0); + return; // either count has not dropped to 0, or waiter is not waiting + } + std::unique_lock l(mu_); + eigen_assert(!notified_); + notified_ = true; + cv_.notify_all(); + } + + void Wait() { + unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel); + if ((v >> 1) == 0) return; + std::unique_lock l(mu_); + while (!notified_) { + cv_.wait(l); + } + } + + private: + std::mutex mu_; + std::condition_variable cv_; + std::atomic state_; // low bit is waiter flag + bool notified_; +}; + + +// Notification is an object that allows a user to to wait for another +// thread to signal a notification that an event has occurred. +// +// Multiple threads can wait on the same Notification object, +// but only one caller must call Notify() on the object. +struct Notification : Barrier { + Notification() : Barrier(1) {}; +}; + + +// Runs an arbitrary function and then calls Notify() on the passed in +// Notification. +template struct FunctionWrapperWithNotification +{ + static void run(Notification* n, Function f, Args... args) { + f(args...); + if (n) { + n->Notify(); + } + } +}; + +template struct FunctionWrapperWithBarrier +{ + static void run(Barrier* b, Function f, Args... args) { + f(args...); + if (b) { + b->Notify(); + } + } +}; + +template +static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { + if (n) { + n->Wait(); + } +} + + +// Build a thread pool device on top the an existing pool of threads. +struct ThreadPoolDevice { + // The ownership of the thread pool remains with the caller. 
+ ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + + EIGEN_STRONG_INLINE int numThreads() const { + return num_threads_; + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + return l1CacheSize(); + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // The l3 cache size is shared between all the cores. + return l3CacheSize() / num_threads_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { + // Should return an enum that encodes the ISA supported by the CPU + return 1; + } + + template + EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { + Notification* n = new Notification(); + pool_->Schedule(std::bind(&FunctionWrapperWithNotification::run, n, f, args...)); + return n; + } + + template + EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, + Function&& f, + Args&&... args) const { + pool_->Schedule(std::bind( + &FunctionWrapperWithBarrier::run, b, f, args...)); + } + + template + EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const { + pool_->Schedule(std::bind(f, args...)); + } + + // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if + // called from one of the threads in pool_. Returns -1 otherwise. 
+ EIGEN_STRONG_INLINE int currentThreadId() const { + return pool_->CurrentThreadId(); + } + + // parallelFor executes f with [0, n) arguments in parallel and waits for + // completion. F accepts a half-open interval [first, last). + // Block size is choosen based on the iteration cost and resulting parallel + // efficiency. If block_align is not nullptr, it is called to round up the + // block size. + void parallelFor(Index n, const TensorOpCost& cost, + std::function block_align, + std::function f) const { + typedef TensorCostModel CostModel; + if (n <= 1 || numThreads() == 1 || + CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { + f(0, n); + return; + } + + // Calculate block size based on (1) the iteration cost and (2) parallel + // efficiency. We want blocks to be not too small to mitigate + // parallelization overheads; not too large to mitigate tail + // effect and potential load imbalance and we also want number + // of blocks to be evenly dividable across threads. + + double block_size_f = 1.0 / CostModel::taskSize(1, cost); + const Index max_oversharding_factor = 4; + Index block_size = numext::mini( + n, numext::maxi(divup(n, max_oversharding_factor * numThreads()), + block_size_f)); + const Index max_block_size = numext::mini(n, 2 * block_size); + if (block_align) { + Index new_block_size = block_align(block_size); + eigen_assert(new_block_size >= block_size); + block_size = numext::mini(n, new_block_size); + } + Index block_count = divup(n, block_size); + // Calculate parallel efficiency as fraction of total CPU time used for + // computations: + double max_efficiency = + static_cast(block_count) / + (divup(block_count, numThreads()) * numThreads()); + // Now try to increase block size up to max_block_size as long as it + // doesn't decrease parallel efficiency. 
+ for (Index prev_block_count = block_count; + max_efficiency < 1.0 && prev_block_count > 1;) { + // This is the next block size that divides size into a smaller number + // of blocks than the current block_size. + Index coarser_block_size = divup(n, prev_block_count - 1); + if (block_align) { + Index new_block_size = block_align(coarser_block_size); + eigen_assert(new_block_size >= coarser_block_size); + coarser_block_size = numext::mini(n, new_block_size); + } + if (coarser_block_size > max_block_size) { + break; // Reached max block size. Stop. + } + // Recalculate parallel efficiency. + const Index coarser_block_count = divup(n, coarser_block_size); + eigen_assert(coarser_block_count < prev_block_count); + prev_block_count = coarser_block_count; + const double coarser_efficiency = + static_cast(coarser_block_count) / + (divup(coarser_block_count, numThreads()) * numThreads()); + if (coarser_efficiency + 0.01 >= max_efficiency) { + // Taking it. + block_size = coarser_block_size; + block_count = coarser_block_count; + if (max_efficiency < coarser_efficiency) { + max_efficiency = coarser_efficiency; + } + } + } + + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + Barrier barrier(static_cast(block_count)); + std::function handleRange; + handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) { + if (last - first <= block_size) { + // Single block or less, execute directly. + f(first, last); + barrier.Notify(); + return; + } + // Split into halves and submit to the pool. + Index mid = first + divup((last - first) / 2, block_size) * block_size; + pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); + pool_->Schedule([=, &handleRange]() { handleRange(first, mid); }); + }; + handleRange(0, n); + barrier.Wait(); + } + + // Convenience wrapper for parallelFor that does not align blocks. 
+ void parallelFor(Index n, const TensorOpCost& cost, + std::function f) const { + parallelFor(n, cost, nullptr, std::move(f)); + } + + private: + ThreadPoolInterface* pool_; + int num_threads_; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h new file mode 100644 index 00000000..2a4e5b5c --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h @@ -0,0 +1,236 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H + +namespace Eigen { + +/** \internal + * + * \class TensorDimensionList + * \ingroup CXX11_Tensor_Module + * + * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n. 
+ * + * \sa Tensor + */ + +template struct DimensionList { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + const Index operator[] (const Index i) const { return i; } +}; + +namespace internal { + +template struct array_size > { + static const size_t value = Rank; +}; +template struct array_size > { + static const size_t value = Rank; +}; + +template const Index array_get(DimensionList&) { + return n; +} +template const Index array_get(const DimensionList&) { + return n; +} + + +#if EIGEN_HAS_CONSTEXPR +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { + return true; + } +}; +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { + return true; + } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return true; + } +}; +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return true; + } +}; + +template +struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return true; + } +}; +template +struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return true; + } +}; + +template +struct index_statically_eq_impl > { + static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i == value; + } +}; +template +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i == value; + } +}; + +template +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i != value; + } +}; +template +struct index_statically_ne_impl > { + static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i != value; + } +}; + +template +struct index_statically_gt_impl 
> { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i > value; + } +}; +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i > value; + } +}; + +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i < value; + } +}; +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i < value; + } +}; + +#else +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { + return true; + } +}; +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { + return true; + } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() { + return true; + } +}; +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() { + return true; + } +}; + +template +struct indices_statically_known_to_increase_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { + return true; + } +}; +template +struct indices_statically_known_to_increase_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { + return true; + } +}; + +template +struct index_statically_eq_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; +template +struct index_statically_eq_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; + +template +struct index_statically_ne_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex){ + return false; 
+ } +}; +template +struct index_statically_ne_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; + +template +struct index_statically_gt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; +template +struct index_statically_gt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; + +template +struct index_statically_lt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; +template +struct index_statically_lt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; +#endif + +} // end namespace internal +} // end namespace Eigen + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h new file mode 100644 index 00000000..7ffef43e --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -0,0 +1,428 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H + + +namespace Eigen { + +/** \internal + * + * \class TensorDimensions + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode and store the dimensions of a Tensor. + * + * The Sizes class encodes as part of the type the number of dimensions and the + * sizes corresponding to each dimension. 
It uses no storage space since it is + * entirely known at compile time. + * The DSizes class is its dynamic sibling: the number of dimensions is known + * at compile time but the sizes are set during execution. + * + * \sa Tensor + */ + +// Boilerplate code +namespace internal { + +template struct dget { + static const std::size_t value = get::value; +}; + + +template +struct fixed_size_tensor_index_linearization_helper +{ + template EIGEN_DEVICE_FUNC + static inline Index run(array const& indices, + const Dimensions& dimensions) + { + return array_get(indices) + + dget::value * + fixed_size_tensor_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct fixed_size_tensor_index_linearization_helper +{ + template EIGEN_DEVICE_FUNC + static inline Index run(array const&, const Dimensions&) + { + return 0; + } +}; + +template +struct fixed_size_tensor_index_extraction_helper +{ + template EIGEN_DEVICE_FUNC + static inline Index run(const Index index, + const Dimensions& dimensions) + { + const Index mult = (index == n-1) ? 
1 : 0; + return array_get(dimensions) * mult + + fixed_size_tensor_index_extraction_helper::run(index, dimensions); + } +}; + +template +struct fixed_size_tensor_index_extraction_helper +{ + template EIGEN_DEVICE_FUNC + static inline Index run(const Index, + const Dimensions&) + { + return 0; + } + }; + +} // end namespace internal + + +// Fixed size +#ifndef EIGEN_EMULATE_CXX11_META_H +template +struct Sizes : internal::numeric_list { + typedef internal::numeric_list Base; + static const std::ptrdiff_t total_size = internal::arg_prod(Indices...); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { + return Base::count; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() { + return internal::arg_prod(Indices...); + } + + EIGEN_DEVICE_FUNC Sizes() { } + template + explicit EIGEN_DEVICE_FUNC Sizes(const array& /*indices*/) { + // todo: add assertion + } +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { } + explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list /*l*/) { + // todo: add assertion + } +#endif + + template Sizes& operator = (const T& /*other*/) { + // add assertion failure if the size of other is different + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const { + return internal::fixed_size_tensor_index_extraction_helper::run(index, *this); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes&) { + return Sizes::total_size; +} +} + 
+#else + +template +struct non_zero_size { + typedef internal::type2val type; +}; +template <> +struct non_zero_size<0> { + typedef internal::null_type type; +}; + +template struct Sizes { + typedef typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type Base; + static const size_t count = Base::count; + static const std::size_t total_size = internal::arg_prod::value; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return count; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() { + return internal::arg_prod::value; + } + + Sizes() { } + template + explicit Sizes(const array& /*indices*/) { + // todo: add assertion + } + template Sizes& operator = (const T& /*other*/) { + // add assertion failure if the size of other is different + return *this; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template Sizes(DenseIndex... /*indices*/) { } + explicit Sizes(std::initializer_list) { + // todo: add assertion + } +#else + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) { + } + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) { + } + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) { + } + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { + } + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index operator[] (const Index index) const { + switch (index) { + case 0: + return internal::get<0, Base>::value; + case 1: + return internal::get<1, Base>::value; + case 2: + return internal::get<2, Base>::value; + case 3: + return internal::get<3, Base>::value; + case 4: + return internal::get<4, Base>::value; + default: + eigen_assert(false && "index overflow"); + return static_cast(-1); + } + } + + template EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE + size_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); + } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes&) { + return Sizes::total_size; +} +} + +#endif + +// Boilerplate +namespace internal { +template +struct tensor_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, array const& dimensions) + { + return array_get(indices) + + array_get(dimensions) * + tensor_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct tensor_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, array const&) + { + return array_get(indices); + } +}; +} // end namespace internal + + + +// Dynamic size +template +struct DSizes : array { + typedef array Base; + static const int count = NumDims; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return NumDims; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const { + return (NumDims == 0) ? 1 : internal::array_prod(*static_cast(this)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = 0; + } + } + EIGEN_DEVICE_FUNC explicit DSizes(const array& a) : Base(a) { } + + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { + eigen_assert(NumDims == 1); + (*this)[0] = i0; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... 
otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) { + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) { + eigen_assert(NumDims == 2); + (*this)[0] = i0; + (*this)[1] = i1; + } + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + eigen_assert(NumDims == 3); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + } + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + eigen_assert(NumDims == 4); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + } + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + eigen_assert(NumDims == 5); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + (*this)[4] = i4; + } +#endif + + EIGEN_DEVICE_FUNC DSizes& operator = (const array& other) { + *static_cast(this) = other; + return *this; + } + + // A constexpr would be so much better here + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); + } +}; + + + + +// Boilerplate +namespace internal { +template +struct tensor_vsize_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, std::vector const& dimensions) + { + return array_get(indices) + + array_get(dimensions) * + tensor_vsize_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct tensor_vsize_index_linearization_helper +{ + static 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, std::vector const&) + { + return array_get(indices); + } +}; +} // end namespace internal + + +namespace internal { + +template struct array_size > { + static const size_t value = NumDims; +}; +template struct array_size > { + static const size_t value = NumDims; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template struct array_size > { +static const std::ptrdiff_t value = Sizes::count; +}; +template struct array_size > { +static const std::ptrdiff_t value = Sizes::count; +}; +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { + return get >::value; +} +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) { + eigen_assert(false && "should never be called"); + return -1; +} +#else +template struct array_size > { + static const size_t value = Sizes::count; +}; +template struct array_size > { + static const size_t value = Sizes::count; +}; +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes&) { + return get::Base>::value; +} + +#endif + + +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { + return false; + } +}; +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) { + return (array_get(dims1) == array_get(dims2)) & + sizes_match_below_dim::run(dims1, dims2); + } +}; +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { + return true; + } +}; + +} // end namespace internal + + +template +EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) { + return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h 
b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h new file mode 100644 index 00000000..5bd24881 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -0,0 +1,181 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template class MakePointer_> +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0 + }; + template + struct MakePointer { + // Intermediate typedef to workaround MSVC issue. 
+ typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; +}; + +template class MakePointer_> +struct eval, Eigen::Dense> +{ + typedef const TensorEvalToOp& type; +}; + +template class MakePointer_> +struct nested, 1, typename eval >::type> +{ + typedef TensorEvalToOp type; +}; + +} // end namespace internal + + + + +template class MakePointer_> +class TensorEvalToOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename MakePointer_::Type PointerType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr) + : m_xpr(expr), m_buffer(buffer) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; } + + protected: + typename XprType::Nested m_xpr; + PointerType m_buffer; +}; + + + +template class MakePointer_> +struct TensorEvaluator, Device> +{ + typedef TensorEvalToOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = true + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : 
m_impl(op.expression(), device), m_device(device), + m_buffer(op.buffer()), m_op(op), m_expression(op.expression()) + { } + + // Used for accessor extraction in SYCL Managed TensorMap: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const { + return m_op; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + } + + typedef typename internal::traits >::template MakePointer::Type DevicePointer; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) { + EIGEN_UNUSED_VARIABLE(scalar); + eigen_assert(scalar == NULL); + return m_impl.evalSubExprsIfNeeded(m_buffer); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_buffer[i] = m_impl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? Aligned : Unaligned>(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt(m_buffer + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here. 
+ return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; } + ArgType expression() const { return m_expression; } + + /// required by sycl in order to extract the accessor + const TensorEvaluator& impl() const { return m_impl; } + /// added for sycl in order to construct the buffer from the sycl device + const Device& device() const{return m_device;} + + private: + TensorEvaluator m_impl; + const Device& m_device; + DevicePointer m_buffer; + const XprType& m_op; + const ArgType m_expression; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h new file mode 100644 index 00000000..d087ca4c --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -0,0 +1,633 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor evaluator classes. + * + * These classes are responsible for the evaluation of the tensor expression. + * + * TODO: add support for more types of expressions, in particular expressions + * leading to lvalues (slicing, reshaping, etc...) 
+ */ + +// Generic evaluator +template +struct TensorEvaluator +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits::NumDimensions > 0 ? + internal::traits::NumDimensions : 0; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = (internal::unpacket_traits::size > 1), + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(const_cast::template MakePointer::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) + { } + + // Used for accessor extraction in SYCL Managed TensorMap: + const Derived& derived() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { + if (dest) { + m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); + return false; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); + return m_data[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_assert(m_data); + return m_data[index]; + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt(m_data + index); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + return internal::pstoret(m_data + index, x); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType 
coeff(const array& coords) const { + eigen_assert(m_data); + if (static_cast(Layout) == static_cast(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { + eigen_assert(m_data); + if (static_cast(Layout) == static_cast(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits::size); + } + + EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } + + /// required by sycl in order to construct sycl buffer from raw pointer + const Device& device() const{return m_device;} + + protected: + typename internal::traits::template MakePointer::Type m_data; + Dimensions m_dims; + const Device& m_device; + const Derived& m_impl; +}; + +namespace { +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T loadConstant(const T* address) { + return *address; +} +// Use the texture cache on CUDA devices whenever possible +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float loadConstant(const float* address) { + return __ldg(address); +} +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double loadConstant(const double* address) { + return __ldg(address); +} +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +Eigen::half loadConstant(const Eigen::half* address) { + return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); +} +#endif +} + + +// Default evaluator for rvalues +template +struct TensorEvaluator +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename 
PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits::NumDimensions > 0 ? + internal::traits::NumDimensions : 0; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = (internal::unpacket_traits::size > 1), + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true + }; + + // Used for accessor extraction in SYCL Managed TensorMap: + const Derived& derived() const { return m_impl; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + if (!NumTraits::type>::RequireInitialization && data) { + m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar)); + return false; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); + return loadConstant(m_data+index); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt_ro(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { + eigen_assert(m_data); + const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? 
m_dims.IndexOfColMajor(coords) + : m_dims.IndexOfRowMajor(coords); + return loadConstant(m_data+index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits::size); + } + + EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } + + /// added for sycl in order to construct the buffer from the sycl device + const Device& device() const{return m_device;} + + protected: + typename internal::traits::template MakePointer::Type m_data; + Dimensions m_dims; + const Device& m_device; + const Derived& m_impl; +}; + + + + +// -------------------- CwiseNullaryOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorCwiseNullaryOp XprType; + + enum { + IsAligned = true, + PacketAccess = internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC + TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_wrapper(m_functor, index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) 
const + { + return m_wrapper.template packetOp(m_functor, index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits::size); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + /// required by sycl in order to extract the accessor + const TensorEvaluator& impl() const { return m_argImpl; } + /// required by sycl in order to extract the accessor + NullaryOp functor() const { return m_functor; } + + + private: + const NullaryOp m_functor; + TensorEvaluator m_argImpl; + const internal::nullary_wrapper m_wrapper; +}; + + + +// -------------------- CwiseUnaryOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorCwiseUnaryOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_argImpl(op.nestedExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_argImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_argImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_argImpl.coeff(index)); + } + + 
template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_argImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_argImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + /// required by sycl in order to extract the accessor + const TensorEvaluator & impl() const { return m_argImpl; } + /// added for sycl in order to construct the buffer from sycl device + UnaryOp functor() const { return m_functor; } + + + private: + const UnaryOp m_functor; + TensorEvaluator m_argImpl; +}; + + +// -------------------- CwiseBinaryOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorCwiseBinaryOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + 
typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use right impl instead if right impl dimensions are known at compile time. + return m_leftImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_leftImpl.costPerCoeff(vectorized) + + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& left_impl() const { return m_leftImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& right_impl() const { return m_rightImpl; } + /// required by sycl in order to extract the accessor + BinaryOp functor() const { return m_functor; } + + private: + const BinaryOp m_functor; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + +// -------------------- CwiseTernaryOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorCwiseTernaryOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess 
= TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_arg1Impl(op.arg1Expression(), device), + m_arg2Impl(op.arg2Expression(), device), + m_arg3Impl(op.arg3Expression(), device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::Index, + typename internal::traits::Index>::value), + STORAGE_INDEX_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::Index, + typename internal::traits::Index>::value), + STORAGE_INDEX_MUST_MATCH) + + eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use arg2 or arg3 dimensions if they are known at compile time. 
+ return m_arg1Impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_arg1Impl.evalSubExprsIfNeeded(NULL); + m_arg2Impl.evalSubExprsIfNeeded(NULL); + m_arg3Impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_arg1Impl.cleanup(); + m_arg2Impl.cleanup(); + m_arg3Impl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_arg1Impl.template packet(index), + m_arg2Impl.template packet(index), + m_arg3Impl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_arg1Impl.costPerCoeff(vectorized) + + m_arg2Impl.costPerCoeff(vectorized) + + m_arg3Impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + /// required by sycl in order to extract the accessor + const TensorEvaluator & arg1Impl() const { return m_arg1Impl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& arg2Impl() const { return m_arg2Impl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& arg3Impl() const { return m_arg3Impl; } + + private: + const TernaryOp m_functor; + TensorEvaluator m_arg1Impl; + TensorEvaluator m_arg2Impl; + TensorEvaluator m_arg3Impl; +}; + + +// -------------------- SelectOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorSelectOp XprType; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + 
PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + internal::packet_traits::HasBlend, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_condImpl(op.ifExpression(), device), + m_thenImpl(op.thenExpression(), device), + m_elseImpl(op.elseExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); + eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use then or else impl instead if they happen to be known at compile time. + return m_condImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_condImpl.evalSubExprsIfNeeded(NULL); + m_thenImpl.evalSubExprsIfNeeded(NULL); + m_elseImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_condImpl.cleanup(); + m_thenImpl.cleanup(); + m_elseImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_condImpl.coeff(index) ? 
m_thenImpl.coeff(index) : m_elseImpl.coeff(index); + } + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + internal::Selector select; + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index+i); + } + return internal::pblend(select, + m_thenImpl.template packet(index), + m_elseImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return m_condImpl.costPerCoeff(vectorized) + + m_thenImpl.costPerCoeff(vectorized) + .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator & cond_impl() const { return m_condImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& then_impl() const { return m_thenImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& else_impl() const { return m_elseImpl; } + + private: + TensorEvaluator m_condImpl; + TensorEvaluator m_thenImpl; + TensorEvaluator m_elseImpl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h new file mode 100644 index 00000000..77f9c7c5 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -0,0 +1,288 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H + +namespace Eigen { + +/** \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + */ +namespace internal { + +// Default strategy: the expression is evaluated with a single cpu thread. +template +class TensorExecutor +{ + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, const Device& device = Device()) + { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + for (Index i = 0; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + + +template +class TensorExecutor +{ + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) + { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + const int PacketSize = unpacket_traits::PacketReturnType>::size; + // Give the compiler a strong hint to unroll the loop. But don't insist + // on unrolling, because if the function is expensive the compiler should not + // unroll the loop at the expense of inlining. 
+ const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; + for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { + for (Index j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } + } + const Index VectorizedSize = (size / PacketSize) * PacketSize; + for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (Index i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + + + +// Multicore strategy: the index space is partitioned and each partition is executed on a single core +#ifdef EIGEN_USE_THREADS +template +struct EvalRange { + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; + eigen_assert(last >= first); + for (Index i = first; i < last; ++i) { + evaluator.evalScalar(i); + } + } + + static Index alignBlockSize(Index size) { + return size; + } +}; + +template +struct EvalRange { + static const int PacketSize = unpacket_traits::size; + + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; + eigen_assert(last >= first); + Index i = first; + if (last - first >= PacketSize) { + eigen_assert(first % PacketSize == 0); + Index last_chunk_offset = last - 4 * PacketSize; + // Give the compiler a strong hint to unroll the loop. But don't insist + // on unrolling, because if the function is expensive the compiler should not + // unroll the loop at the expense of inlining. 
+ for (; i <= last_chunk_offset; i += 4*PacketSize) { + for (Index j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } + } + last_chunk_offset = last - PacketSize; + for (; i <= last_chunk_offset; i += PacketSize) { + evaluator.evalPacket(i); + } + } + for (; i < last; ++i) { + evaluator.evalScalar(i); + } + } + + static Index alignBlockSize(Index size) { + // Align block size to packet size and account for unrolling in run above. + if (size >= 16 * PacketSize) { + return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); + } + // Aligning to 4 * PacketSize would increase block size by more than 25%. + return (size + PacketSize - 1) & ~(PacketSize - 1); + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const ThreadPoolDevice& device) + { + typedef TensorEvaluator Evaluator; + Evaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); +#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) + device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [&evaluator](Index first, Index last) { + EvalRange::run(&evaluator, first, last); + }); +#else + size_t num_threads = device.numThreads(); + if (num_threads > 1) { + num_threads = TensorCostModel::numThreads( + size, evaluator.costPerCoeff(Vectorizable), num_threads); + } + if (num_threads == 1) { + EvalRange::run(&evaluator, 0, size); + } else { + const Index PacketSize = Vectorizable ? 
unpacket_traits::size : 1; + Index blocksz = std::ceil(static_cast(size)/num_threads) + PacketSize - 1; + const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + Barrier barrier(numblocks); + for (int i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier( + &barrier, &EvalRange::run, + &evaluator, i * blocksize, (i + 1) * blocksize); + } + if (numblocks * blocksize < size) { + EvalRange::run( + &evaluator, numblocks * blocksize, size); + } + barrier.Wait(); + } +#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL) + } + evaluator.cleanup(); + } +}; +#endif // EIGEN_USE_THREADS + + +// GPU: the evaluation of the expression is offloaded to a GPU. +#if defined(EIGEN_USE_GPU) + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + static void run(const Expression& expr, const GpuDevice& device); +}; + + +#if defined(__CUDACC__) +template +struct EigenMetaKernelEval { + static __device__ EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, Index first, Index last, Index step_size) { + for (Index i = first; i < last; i += step_size) { + eval.evalScalar(i); + } + } +}; + +template +struct EigenMetaKernelEval { + static __device__ EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, Index first, Index last, Index step_size) { + const Index PacketSize = unpacket_traits::size; + const Index vectorized_size = (last / PacketSize) * PacketSize; + const Index vectorized_step_size = step_size * PacketSize; + + // Use the vector path + for (Index i = first * PacketSize; i < vectorized_size; + i += vectorized_step_size) { + eval.evalPacket(i); + } + for (Index i = vectorized_size + first; i < last; i += step_size) { + eval.evalScalar(i); + } + } +}; + +template +__global__ void +__launch_bounds__(1024) +EigenMetaKernel(Evaluator eval, Index size) { + + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; + + const 
bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; + EigenMetaKernelEval::run(eval, first_index, size, step_size); +} + +/*static*/ +template +inline void TensorExecutor::run( + const Expression& expr, const GpuDevice& device) { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const int block_size = device.maxCudaThreadsPerBlock(); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; + const Index size = array_prod(evaluator.dimensions()); + // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. + const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); + + LAUNCH_CUDA_KERNEL( + (EigenMetaKernel, Index>), + num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); +} + +#endif // __CUDACC__ +#endif // EIGEN_USE_GPU + +// SYCL Executor policy +#ifdef EIGEN_USE_SYCL + +template +class TensorExecutor { +public: + static inline void run(const Expression &expr, const SyclDevice &device) { + // call TensorSYCL module + TensorSycl::run(expr, device); + } +}; + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h new file mode 100644 index 00000000..5267b70d --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -0,0 +1,371 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H + +namespace Eigen { + +/** \class TensorExpr + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor expression classes. + * + * The TensorCwiseNullaryOp class applies a nullary operators to an expression. + * This is typically used to generate constants. + * + * The TensorCwiseUnaryOp class represents an expression where a unary operator + * (e.g. cwiseSqrt) is applied to an expression. + * + * The TensorCwiseBinaryOp class represents an expression where a binary + * operator (e.g. addition) is applied to a lhs and a rhs expression. + * + */ +namespace internal { +template +struct traits > + : traits +{ + typedef traits XprTraits; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0 + }; +}; + +} // end namespace internal + + + +template +class TensorCwiseNullaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef TensorCwiseNullaryOp Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + const NullaryOp& functor() const { return m_functor; } + + protected: + typename XprType::Nested m_xpr; + const 
NullaryOp m_functor; +}; + + + +namespace internal { +template +struct traits > + : traits +{ + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename result_of::type Scalar; + typedef traits XprTraits; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCwiseUnaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorCwiseUnaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCwiseUnaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const UnaryOp& functor() const { return m_functor; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const UnaryOp m_functor; +}; + + +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs + // are different. + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. 
Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. + typedef typename result_of< + BinaryOp(typename LhsXprType::Scalar, + typename RhsXprType::Scalar)>::type Scalar; + typedef traits XprTraits; + typedef typename promote_storage_type< + typename traits::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type< + typename traits::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCwiseBinaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorCwiseBinaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCwiseBinaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. 
+ typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const BinaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const BinaryOp m_functor; +}; + + +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the args are different. 
+ typedef typename result_of< + TernaryOp(typename Arg1XprType::Scalar, + typename Arg2XprType::Scalar, + typename Arg3XprType::Scalar)>::type Scalar; + typedef traits XprTraits; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename Arg1XprType::Nested Arg1Nested; + typedef typename Arg2XprType::Nested Arg2Nested; + typedef typename Arg3XprType::Nested Arg3Nested; + typedef typename remove_reference::type _Arg1Nested; + typedef typename remove_reference::type _Arg2Nested; + typedef typename remove_reference::type _Arg3Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCwiseTernaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorCwiseTernaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCwiseTernaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp()) + : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const TernaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + arg1Expression() const { return m_arg1_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + arg2Expression() const { return m_arg2_xpr; } + + EIGEN_DEVICE_FUNC 
+ const typename internal::remove_all::type& + arg3Expression() const { return m_arg3_xpr; } + + protected: + typename Arg1XprType::Nested m_arg1_xpr; + typename Arg2XprType::Nested m_arg2_xpr; + typename Arg3XprType::Nested m_arg3_xpr; + const TernaryOp m_functor; +}; + + +namespace internal { +template +struct traits > + : traits +{ + typedef typename traits::Scalar Scalar; + typedef traits XprTraits; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename IfXprType::Nested IfNested; + typedef typename ThenXprType::Nested ThenNested; + typedef typename ElseXprType::Nested ElseNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorSelectOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorSelectOp type; +}; + +} // end namespace internal + + +template +class TensorSelectOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC + TensorSelectOp(const IfXprType& a_condition, + const ThenXprType& a_then, + const ElseXprType& a_else) + : m_condition(a_condition), m_then(a_then), m_else(a_else) + { } + + EIGEN_DEVICE_FUNC + const IfXprType& ifExpression() const { return m_condition; } + + EIGEN_DEVICE_FUNC + const ThenXprType& thenExpression() const { return m_then; } + + EIGEN_DEVICE_FUNC + const ElseXprType& elseExpression() const { return m_else; } + + protected: + typename 
IfXprType::Nested m_condition; + typename ThenXprType::Nested m_then; + typename ElseXprType::Nested m_else; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h new file mode 100644 index 00000000..473e4940 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -0,0 +1,651 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Jianwei Cui +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H +#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H + +// This code requires the ability to initialize arrays of constant +// values directly inside a class. +#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900 + +namespace Eigen { + +/** \class TensorFFT + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor FFT class. 
+ * + * TODO: + * Vectorize the Cooley Tukey and the Bluestein algorithm + * Add support for multithreaded evaluation + * Improve the performance on GPU + */ + +template struct MakeComplex { + template + EIGEN_DEVICE_FUNC + T operator() (const T& val) const { return val; } +}; + +template <> struct MakeComplex { + template + EIGEN_DEVICE_FUNC + std::complex operator() (const T& val) const { return std::complex(val, 0); } +}; + +template <> struct MakeComplex { + template + EIGEN_DEVICE_FUNC + std::complex operator() (const std::complex& val) const { return val; } +}; + +template struct PartOf { + template T operator() (const T& val) const { return val; } +}; + +template <> struct PartOf { + template T operator() (const std::complex& val) const { return val.real(); } +}; + +template <> struct PartOf { + template T operator() (const std::complex& val) const { return val.imag(); } +}; + +namespace internal { +template +struct traits > : public traits { + typedef traits XprTraits; + typedef typename NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename XprTraits::Scalar InputScalar; + typedef typename conditional::type OutputScalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorFFTOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorFFTOp type; +}; + +} // end namespace internal + +template +class TensorFFTOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename internal::conditional::type OutputScalar; + 
typedef OutputScalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft) + : m_xpr(expr), m_fft(fft) {} + + EIGEN_DEVICE_FUNC + const FFT& fft() const { return m_fft; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& expression() const { + return m_xpr; + } + + protected: + typename XprType::Nested m_xpr; + const FFT m_fft; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorFFTOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef internal::traits XprTraits; + typedef typename XprTraits::Scalar InputScalar; + typedef typename internal::conditional::type OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, + PacketAccess = true, + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + 
m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + } + } + m_size = m_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (data) { + evalToBuf(data); + return false; + } else { + m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size); + evalToBuf(m_data); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_data) { + m_device.deallocate(m_data); + m_data = NULL; + } + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { + return m_data[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType + packet(Index index) const { + return internal::ploadt(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } + + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) { + const bool write_to_out = internal::is_same::value; + ComplexScalar* buf = write_to_out ? 
(ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); + + for (Index i = 0; i < m_size; ++i) { + buf[i] = MakeComplex::value>()(m_impl.coeff(i)); + } + + for (size_t i = 0; i < m_fft.size(); ++i) { + Index dim = m_fft[i]; + eigen_assert(dim >= 0 && dim < NumDims); + Index line_len = m_dimensions[dim]; + eigen_assert(line_len >= 1); + ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len); + const bool is_power_of_two = isPowerOfTwo(line_len); + const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len); + const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite); + + ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* pos_j_base_powered = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1)); + if (!is_power_of_two) { + // Compute twiddle factors + // t_n = exp(sqrt(-1) * pi * n^2 / line_len) + // for n = 0, 1,..., line_len-1. 
+ // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 + pos_j_base_powered[0] = ComplexScalar(1, 0); + if (line_len > 1) { + const RealScalar pi_over_len(EIGEN_PI / line_len); + const ComplexScalar pos_j_base = ComplexScalar( + std::cos(pi_over_len), std::sin(pi_over_len)); + pos_j_base_powered[1] = pos_j_base; + if (line_len > 2) { + const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; + for (int j = 2; j < line_len + 1; ++j) { + pos_j_base_powered[j] = pos_j_base_powered[j - 1] * + pos_j_base_powered[j - 1] / + pos_j_base_powered[j - 2] * pos_j_base_sq; + } + } + } + } + + for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) { + const Index base_offset = getBaseOffsetFromIndex(partial_index, dim); + + // get data into line_buf + const Index stride = m_strides[dim]; + if (stride == 1) { + memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); + } else { + Index offset = base_offset; + for (int j = 0; j < line_len; ++j, offset += stride) { + line_buf[j] = buf[offset]; + } + } + + // processs the line + if (is_power_of_two) { + processDataLineCooleyTukey(line_buf, line_len, log_len); + } + else { + processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered); + } + + // write back + if (FFTDir == FFT_FORWARD && stride == 1) { + memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); + } else { + Index offset = base_offset; + const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); + for (int j = 0; j < line_len; ++j, offset += stride) { + buf[offset] = (FFTDir == FFT_FORWARD) ? 
line_buf[j] : line_buf[j] * div_factor; + } + } + } + m_device.deallocate(line_buf); + if (!is_power_of_two) { + m_device.deallocate(a); + m_device.deallocate(b); + m_device.deallocate(pos_j_base_powered); + } + } + + if(!write_to_out) { + for (Index i = 0; i < m_size; ++i) { + data[i] = PartOf()(buf[i]); + } + m_device.deallocate(buf); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) { + eigen_assert(x > 0); + return !(x & (x - 1)); + } + + // The composite number for padding, used in Bluestein's FFT algorithm + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) { + Index i = 2; + while (i < 2 * n - 1) i *= 2; + return i; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) { + Index log2m = 0; + while (m >>= 1) log2m++; + return log2m; + } + + // Call Cooley Tukey algorithm directly, data length must be power of 2 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, Index log_len) { + eigen_assert(isPowerOfTwo(line_len)); + scramble_FFT(line_buf, line_len); + compute_1D_Butterfly(line_buf, line_len, log_len); + } + + // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, Index good_composite, Index log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) { + Index n = line_len; + Index m = good_composite; + ComplexScalar* data = line_buf; + + for (Index i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + a[i] = data[i] * numext::conj(pos_j_base_powered[i]); + } + else { + a[i] = data[i] * pos_j_base_powered[i]; + } + } + for (Index i = n; i < m; ++i) { + a[i] = ComplexScalar(0, 0); + } + + for (Index i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[i]; + } + else { + b[i] = 
numext::conj(pos_j_base_powered[i]); + } + } + for (Index i = n; i < m - n; ++i) { + b[i] = ComplexScalar(0, 0); + } + for (Index i = m - n; i < m; ++i) { + if(FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[m-i]; + } + else { + b[i] = numext::conj(pos_j_base_powered[m-i]); + } + } + + scramble_FFT(a, m); + compute_1D_Butterfly(a, m, log_len); + + scramble_FFT(b, m); + compute_1D_Butterfly(b, m, log_len); + + for (Index i = 0; i < m; ++i) { + a[i] *= b[i]; + } + + scramble_FFT(a, m); + compute_1D_Butterfly(a, m, log_len); + + //Do the scaling after ifft + for (Index i = 0; i < m; ++i) { + a[i] /= m; + } + + for (Index i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + data[i] = a[i] * numext::conj(pos_j_base_powered[i]); + } + else { + data[i] = a[i] * pos_j_base_powered[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) { + eigen_assert(isPowerOfTwo(n)); + Index j = 1; + for (Index i = 1; i < n; ++i){ + if (j > i) { + std::swap(data[j-1], data[i-1]); + } + Index m = n >> 1; + while (m >= 2 && j > m) { + j -= m; + m >>= 1; + } + j += m; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) { + ComplexScalar tmp = data[1]; + data[1] = data[0] - data[1]; + data[0] += tmp; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) { + ComplexScalar tmp[4]; + tmp[0] = data[0] + data[1]; + tmp[1] = data[0] - data[1]; + tmp[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]); + } else { + tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]); + } + data[0] = tmp[0] + tmp[2]; + data[1] = tmp[1] + tmp[3]; + data[2] = tmp[0] - tmp[2]; + data[3] = tmp[1] - tmp[3]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) { + ComplexScalar tmp_1[8]; + ComplexScalar tmp_2[8]; + + tmp_1[0] = data[0] + data[1]; + tmp_1[1] = data[0] - 
data[1]; + tmp_1[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1); + } else { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1); + } + tmp_1[4] = data[4] + data[5]; + tmp_1[5] = data[4] - data[5]; + tmp_1[6] = data[6] + data[7]; + if (Dir == FFT_FORWARD) { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1); + } else { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1); + } + tmp_2[0] = tmp_1[0] + tmp_1[2]; + tmp_2[1] = tmp_1[1] + tmp_1[3]; + tmp_2[2] = tmp_1[0] - tmp_1[2]; + tmp_2[3] = tmp_1[1] - tmp_1[3]; + tmp_2[4] = tmp_1[4] + tmp_1[6]; +// SQRT2DIV2 = sqrt(2)/2 +#define SQRT2DIV2 0.7071067811865476 + if (Dir == FFT_FORWARD) { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2); + } else { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2); + } + data[0] = tmp_2[0] + tmp_2[4]; + data[1] = tmp_2[1] + tmp_2[5]; + data[2] = tmp_2[2] + tmp_2[6]; + data[3] = tmp_2[3] + tmp_2[7]; + data[4] = tmp_2[0] - tmp_2[4]; + data[5] = tmp_2[1] - tmp_2[5]; + data[6] = tmp_2[2] - tmp_2[6]; + data[7] = tmp_2[3] - tmp_2[7]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge( + ComplexScalar* data, Index n, Index n_power_of_2) { + // Original code: + // RealScalar wtemp = std::sin(M_PI/n); + // RealScalar wpi = -std::sin(2 * M_PI/n); + const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; + const RealScalar wpi = (Dir == FFT_FORWARD) + ? 
m_minus_sin_2_PI_div_n_LUT[n_power_of_2] + : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; + + const ComplexScalar wp(wtemp, wpi); + const ComplexScalar wp_one = wp + ComplexScalar(1, 0); + const ComplexScalar wp_one_2 = wp_one * wp_one; + const ComplexScalar wp_one_3 = wp_one_2 * wp_one; + const ComplexScalar wp_one_4 = wp_one_3 * wp_one; + const Index n2 = n / 2; + ComplexScalar w(1.0, 0.0); + for (Index i = 0; i < n2; i += 4) { + ComplexScalar temp0(data[i + n2] * w); + ComplexScalar temp1(data[i + 1 + n2] * w * wp_one); + ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2); + ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3); + w = w * wp_one_4; + + data[i + n2] = data[i] - temp0; + data[i] += temp0; + + data[i + 1 + n2] = data[i + 1] - temp1; + data[i + 1] += temp1; + + data[i + 2 + n2] = data[i + 2] - temp2; + data[i + 2] += temp2; + + data[i + 3 + n2] = data[i + 3] - temp3; + data[i + 3] += temp3; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly( + ComplexScalar* data, Index n, Index n_power_of_2) { + eigen_assert(isPowerOfTwo(n)); + if (n > 8) { + compute_1D_Butterfly(data, n / 2, n_power_of_2 - 1); + compute_1D_Butterfly(data + n / 2, n / 2, n_power_of_2 - 1); + butterfly_1D_merge(data, n, n_power_of_2); + } else if (n == 8) { + butterfly_8(data); + } else if (n == 4) { + butterfly_4(data); + } else if (n == 2) { + butterfly_2(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const { + Index result = 0; + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > omitted_dim; --i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + else { + for (Index i = 0; i < omitted_dim; ++i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + 
const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + // Value of index_coords[omitted_dim] is not determined to this step + return result; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const { + Index result = base + offset * m_strides[omitted_dim] ; + return result; + } + + protected: + Index m_size; + const FFT& m_fft; + Dimensions m_dimensions; + array m_strides; + TensorEvaluator m_impl; + CoeffReturnType* m_data; + const Device& m_device; + + // This will support a maximum FFT size of 2^32 for each dimension + // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; + const RealScalar m_sin_PI_div_n_LUT[32] = { + RealScalar(0.0), + RealScalar(-2), + RealScalar(-0.999999999999999), + RealScalar(-0.292893218813453), + RealScalar(-0.0761204674887130), + RealScalar(-0.0192147195967696), + RealScalar(-0.00481527332780311), + RealScalar(-0.00120454379482761), + RealScalar(-3.01181303795779e-04), + RealScalar(-7.52981608554592e-05), + RealScalar(-1.88247173988574e-05), + RealScalar(-4.70619042382852e-06), + RealScalar(-1.17654829809007e-06), + RealScalar(-2.94137117780840e-07), + RealScalar(-7.35342821488550e-08), + RealScalar(-1.83835707061916e-08), + RealScalar(-4.59589268710903e-09), + RealScalar(-1.14897317243732e-09), + RealScalar(-2.87243293150586e-10), + RealScalar( -7.18108232902250e-11), + RealScalar(-1.79527058227174e-11), + RealScalar(-4.48817645568941e-12), + RealScalar(-1.12204411392298e-12), + RealScalar(-2.80511028480785e-13), + RealScalar(-7.01277571201985e-14), + RealScalar(-1.75319392800498e-14), + RealScalar(-4.38298482001247e-15), + RealScalar(-1.09574620500312e-15), + RealScalar(-2.73936551250781e-16), + RealScalar(-6.84841378126949e-17), + RealScalar(-1.71210344531737e-17), + RealScalar(-4.28025861329343e-18) + }; + + // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / 
std::pow(2,i)); + const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = { + RealScalar(0.0), + RealScalar(0.0), + RealScalar(-1.00000000000000e+00), + RealScalar(-7.07106781186547e-01), + RealScalar(-3.82683432365090e-01), + RealScalar(-1.95090322016128e-01), + RealScalar(-9.80171403295606e-02), + RealScalar(-4.90676743274180e-02), + RealScalar(-2.45412285229123e-02), + RealScalar(-1.22715382857199e-02), + RealScalar(-6.13588464915448e-03), + RealScalar(-3.06795676296598e-03), + RealScalar(-1.53398018628477e-03), + RealScalar(-7.66990318742704e-04), + RealScalar(-3.83495187571396e-04), + RealScalar(-1.91747597310703e-04), + RealScalar(-9.58737990959773e-05), + RealScalar(-4.79368996030669e-05), + RealScalar(-2.39684498084182e-05), + RealScalar(-1.19842249050697e-05), + RealScalar(-5.99211245264243e-06), + RealScalar(-2.99605622633466e-06), + RealScalar(-1.49802811316901e-06), + RealScalar(-7.49014056584716e-07), + RealScalar(-3.74507028292384e-07), + RealScalar(-1.87253514146195e-07), + RealScalar(-9.36267570730981e-08), + RealScalar(-4.68133785365491e-08), + RealScalar(-2.34066892682746e-08), + RealScalar(-1.17033446341373e-08), + RealScalar(-5.85167231706864e-09), + RealScalar(-2.92583615853432e-09) + }; +}; + +} // end namespace Eigen + +#endif // EIGEN_HAS_CONSTEXPR + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h new file mode 100644 index 00000000..5a41a867 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -0,0 +1,389 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H +#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H + +namespace Eigen { + +/** \class TensorFixedSize + * \ingroup CXX11_Tensor_Module + * + * \brief The fixed sized version of the tensor class. + * + * The fixed sized equivalent of + * Eigen::Tensor t(3, 5, 7); + * is + * Eigen::TensorFixedSize> t; + */ + +template +class TensorFixedSize : public TensorBase > +{ + public: + typedef TensorFixedSize Self; + typedef TensorBase > Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef Scalar_ Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + static const int Options = Options_; + + enum { + IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0), + Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, + CoordAccess = true, + RawAccess = true + }; + + typedef Dimensions_ Dimensions; + static const std::size_t NumIndices = Dimensions::count; + + protected: + TensorStorage m_storage; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array{{firstIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + if (Options&RowMajor) { + const Index index = i1 + i0 * m_storage.dimensions()[1]; + return m_storage.data()[index]; + } else { + const Index index = i0 + i1 * m_storage.dimensions()[0]; + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + if (Options&RowMajor) { + const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2); + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + if (Options&RowMajor) { + const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3)); + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + if (Options&RowMajor) { + const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0))); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4))); + return m_storage.data()[index]; + } + } +#endif + + + EIGEN_DEVICE_FUNC + 
EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array{{firstIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + if (Options&RowMajor) { + const Index index = i1 + i0 * m_storage.dimensions()[1]; + return m_storage.data()[index]; + } else { + const Index index = i0 + i1 * m_storage.dimensions()[0]; + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + if (Options&RowMajor) { + const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2); + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + if (Options&RowMajor) { + const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3)); + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + if (Options&RowMajor) { + const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0))); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4))); + return m_storage.data()[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array& 
indices) + { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize() + : m_storage() + { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) + : m_storage(other.m_storage) + { + } + +#if EIGEN_HAS_RVALUE_REFERENCES + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other) + : m_storage(other.m_storage) + { + } +#endif + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other) + { + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. 
+ typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other) + { + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE bool checkIndexRange(const array& /*indices*/) const + { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return true; + // check whether the indices are all >= 0 + /* array_apply_and_reduce(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce(indices, m_storage.dimensions());*/ + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h new file mode 100644 index 00000000..53b915da --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -0,0 +1,169 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H + +namespace Eigen { + +namespace internal { +template class MakePointer_> +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0 + }; + template struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; +}; + +template class MakePointer_> +struct eval, Eigen::Dense> +{ + typedef const TensorForcedEvalOp& type; +}; + +template class MakePointer_> +struct nested, 1, typename eval >::type> +{ + typedef TensorForcedEvalOp type; +}; + +} // end namespace internal + + + +// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) + +/** \class TensorForcedEvalOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +/// `template class MakePointer_` is added to convert the host pointer to the device pointer. +/// It is added due to the fact that for our device compiler `T*` is not allowed. +/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. +/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_` is `T*` . +/// Therefore, by adding the default value, we managed to convert the type and it does not break any +/// existing code as its default value is `T*`. 
+template class MakePointer_> +class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + + +template class MakePointer_> +struct TensorEvaluator, Device> +{ + typedef TensorForcedEvalOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = true, + PacketAccess = (PacketSize > 1), + Layout = TensorEvaluator::Layout, + RawAccess = true + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + /// op_ is used for sycl + : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) + { } + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + const Index numValues = internal::array_prod(m_impl.dimensions()); + m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); + // Should initialize the memory in case we're dealing with non POD types. 
+ if (NumTraits::RequireInitialization) { + for (Index i = 0; i < numValues; ++i) { + new(m_buffer+i) CoeffReturnType(); + } + } + typedef TensorEvalToOp< const typename internal::remove_const::type > EvalTo; + EvalTo evalToTmp(m_buffer, m_op); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::type, PacketAccess>::run(evalToTmp, m_device); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_device.deallocate(m_buffer); + m_buffer = NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt(m_buffer + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC typename MakePointer::Type data() const { return m_buffer; } + + /// required by sycl in order to extract the sycl accessor + const TensorEvaluator& impl() { return m_impl; } + /// used by sycl in order to build the sycl buffer + const Device& device() const{return m_device;} + private: + TensorEvaluator m_impl; + const ArgType m_op; + const Device& m_device; + typename MakePointer::Type m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h new file mode 100644 index 00000000..6e76b5c5 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -0,0 +1,109 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H + +namespace Eigen { + +// MakePointer class is used as a container of the adress space of the pointer +// on the host and on the device. From the host side it generates the T* pointer +// and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to +// T* m_data on the host. It is always called on the device. +// Specialisation of MakePointer class for creating the sycl buffer with +// map_allocator. +template struct MakePointer { + typedef T* Type; +}; + +template class MakePointer_ = MakePointer> class TensorMap; +template class Tensor; +template class TensorFixedSize; +template class TensorRef; +template class TensorBase; + +template class TensorCwiseNullaryOp; +template class TensorCwiseUnaryOp; +template class TensorCwiseBinaryOp; +template class TensorCwiseTernaryOp; +template class TensorSelectOp; +template class MakePointer_ = MakePointer > class TensorReductionOp; +template class TensorIndexTupleOp; +template class TensorTupleReducerOp; +template class TensorConcatenationOp; +template class TensorContractionOp; +template class TensorConversionOp; +template class TensorConvolutionOp; +template class TensorFFTOp; +template class TensorPatchOp; +template class TensorImagePatchOp; +template class TensorVolumePatchOp; +template class TensorBroadcastingOp; +template class TensorChippingOp; +template class TensorReshapingOp; +template class TensorLayoutSwapOp; +template class TensorSlicingOp; +template class TensorReverseOp; +template class TensorPaddingOp; +template class TensorShufflingOp; +template class TensorStridingOp; +template class TensorStridingSlicingOp; +template class 
TensorInflationOp; +template class TensorGeneratorOp; +template class TensorAssignOp; +template class TensorScanOp; + +template class TensorCustomUnaryOp; +template class TensorCustomBinaryOp; + +template class MakePointer_ = MakePointer> class TensorEvalToOp; +template class MakePointer_ = MakePointer> class TensorForcedEvalOp; + +template class TensorDevice; +template struct TensorEvaluator; + +struct DefaultDevice; +struct ThreadPoolDevice; +struct GpuDevice; +struct SyclDevice; + +enum FFTResultType { + RealPart = 0, + ImagPart = 1, + BothParts = 2 +}; + +enum FFTDirection { + FFT_FORWARD = 0, + FFT_REVERSE = 1 +}; + + +namespace internal { + +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess; +}; + +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess && + TensorEvaluator::IsAligned; +}; + +template ::value> +class TensorExecutor; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h new file mode 100644 index 00000000..4ed080d3 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -0,0 +1,489 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H + +namespace Eigen { +namespace internal { + + +/** \internal + * \brief Template functor to compute the modulo between an array and a scalar. 
+ */ +template +struct scalar_mod_op { + EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; } + const Scalar m_divisor; +}; +template +struct functor_traits > +{ enum { Cost = scalar_div_cost::value, PacketAccess = false }; }; + + +/** \internal + * \brief Template functor to compute the modulo between 2 arrays. + */ +template +struct scalar_mod2_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op); + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } +}; +template +struct functor_traits > +{ enum { Cost = scalar_div_cost::value, PacketAccess = false }; }; + +template +struct scalar_fmod_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& a, const Scalar& b) const { + return numext::fmod(a, b); + } +}; +template +struct functor_traits > { + enum { Cost = 13, // Reciprocal throughput of FPREM on Haswell. 
+ PacketAccess = false }; +}; + + +/** \internal + * \brief Template functor to compute the sigmoid of a scalar + * \sa class CwiseUnaryOp, ArrayBase::sigmoid() + */ +template +struct scalar_sigmoid_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { + const T one = T(1); + return one / (one + numext::exp(-x)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(const Packet& x) const { + const Packet one = pset1(T(1)); + return pdiv(one, padd(one, pexp(pnegate(x)))); + } +}; + +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost * 2 + NumTraits::MulCost * 6, + PacketAccess = packet_traits::HasAdd && packet_traits::HasDiv && + packet_traits::HasNegate && packet_traits::HasExp + }; +}; + + +template +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + +// Standard reduction functors +template struct SumReducer +{ + static const bool PacketAccess = packet_traits::HasAdd; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + internal::scalar_sum_op sum_op; + *accum = sum_op(*accum, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = padd(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_sum_op sum_op; + return sum_op(saccum, predux(vaccum)); 
+ } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasAdd + }; +}; + + +template struct MeanReducer +{ + static const bool PacketAccess = packet_traits::HasAdd && !NumTraits::IsInteger; + static const bool IsStateful = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + MeanReducer() : scalarCount_(0), packetCount_(0) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + internal::scalar_sum_op sum_op; + *accum = sum_op(*accum, t); + scalarCount_++; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + (*accum) = padd(*accum, p); + packetCount_++; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum / scalarCount_; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, pset1(packetCount_)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_sum_op sum_op; + return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits::size); + } + + protected: + DenseIndex scalarCount_; + DenseIndex packetCount_; +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasAdd + }; +}; + + +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits::lowest(); + } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return -Eigen::NumTraits::infinity(); + } +}; +template +struct MinMaxBottomValue 
{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits::highest(); + } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits::infinity(); + } +}; + + +template struct MaxReducer +{ + static const bool PacketAccess = packet_traits::HasMax; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t > *accum) { *accum = t; } + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmax(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return MinMaxBottomValue::IsInteger>::bottom_value(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return numext::maxi(saccum, predux_max(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasMax + }; +}; + + +template struct MinReducer +{ + static const bool PacketAccess = packet_traits::HasMin; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t < *accum) { *accum = t; } + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmin(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return MinMaxBottomValue::IsInteger>::bottom_value(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet 
initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return numext::mini(saccum, predux_min(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasMin + }; +}; + + +template struct ProdReducer +{ + static const bool PacketAccess = packet_traits::HasMul; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + internal::scalar_product_op prod_op; + (*accum) = prod_op(*accum, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmul(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(1); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_product_op prod_op; + return prod_op(saccum, predux_mul(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::MulCost, + PacketAccess = PacketType::HasMul + }; +}; + + +struct AndReducer +{ + static const bool PacketAccess = false; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { + 
*accum = *accum && t; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { + return accum; + } +}; + +template +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + + +struct OrReducer { + static const bool PacketAccess = false; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { + *accum = *accum || t; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { + return false; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { + return accum; + } +}; + +template +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + + +// Argmin/Argmax reducers +template struct ArgMaxTupleReducer +{ + static const bool PacketAccess = false; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t.second > accum->second) { *accum = t; } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits::lowest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { + return accum; + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false + }; +}; + + +template struct ArgMinTupleReducer +{ + static const bool PacketAccess = false; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { + if (t.second < accum->second) { *accum = t; } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits::highest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { + return accum; + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false + }; +}; + + +template +class 
GaussianGenerator { + public: + static const bool PacketAccess = false; + + EIGEN_DEVICE_FUNC GaussianGenerator(const array& means, + const array& std_devs) + : m_means(means) + { + for (size_t i = 0; i < NumDims; ++i) { + m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; + } + } + + EIGEN_DEVICE_FUNC T operator()(const array& coordinates) const { + T tmp = T(0); + for (size_t i = 0; i < NumDims; ++i) { + T offset = coordinates[i] - m_means[i]; + tmp += offset * offset / m_two_sigmas[i]; + } + return numext::exp(-tmp); + } + + private: + array m_means; + array m_two_sigmas; +}; + +template +struct functor_traits > { + enum { + Cost = NumDims * (2 * NumTraits::AddCost + NumTraits::MulCost + + functor_traits >::Cost) + + functor_traits >::Cost, + PacketAccess = GaussianGenerator::PacketAccess + }; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h new file mode 100644 index 00000000..008d113d --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -0,0 +1,185 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H + +namespace Eigen { + +/** \class TensorGeneratorOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor generator class. 
+ * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorGeneratorOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorGeneratorOp type; +}; + +} // end namespace internal + + + +template +class TensorGeneratorOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator) + : m_xpr(expr), m_generator(generator) {} + + EIGEN_DEVICE_FUNC + const Generator& generator() const { return m_generator; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Generator m_generator; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorGeneratorOp XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator::Dimensions Dimensions; + static const int NumDims = internal::array_size::value; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + enum { + IsAligned = false, + 
PacketAccess = (internal::unpacket_traits::size > 1), + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_generator(op.generator()) + { + TensorEvaluator impl(op.expression(), device); + m_dimensions = impl.dimensions(); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + array coords; + extract_coordinates(index, coords); + return m_generator(coords); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool) const { + // TODO(rmlarsen): This is just a placeholder. Define interface to make + // generators return their cost. 
+ return TensorOpCost(0, 0, TensorOpCost::AddCost() + + TensorOpCost::MulCost()); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void extract_coordinates(Index index, array& coords) const { + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[NumDims-1] = index; + } + } + + Dimensions m_dimensions; + array m_strides; + Generator m_generator; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h new file mode 100644 index 00000000..403d6904 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h @@ -0,0 +1,33 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Eugene Brevdo +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H + +namespace Eigen { + +/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors. + * + * This function computes the regularized incomplete beta function (integral). 
+ * + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const + TensorCwiseTernaryOp, + const ADerived, const BDerived, const XDerived> + betainc(const ADerived& a, const BDerived& b, const XDerived& x) { + return TensorCwiseTernaryOp< + internal::scalar_betainc_op, const ADerived, + const BDerived, const XDerived>( + a, b, x, internal::scalar_betainc_op()); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h new file mode 100644 index 00000000..d3ae8bcb --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -0,0 +1,79 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H +#define EIGEN_CXX11_TENSOR_TENSOR_IO_H + +namespace Eigen { + +namespace internal { + +// Print the tensor as a 2d matrix +template +struct TensorPrinter { + static void run (std::ostream& os, const Tensor& tensor) { + typedef typename internal::remove_const::type Scalar; + typedef typename Tensor::Index Index; + const Index total_size = internal::array_prod(tensor.dimensions()); + if (total_size > 0) { + const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions()); + static const int layout = Tensor::Layout; + Map > matrix(const_cast(tensor.data()), first_dim, total_size/first_dim); + os << matrix; + } + } +}; + + +// Print the tensor as a vector +template +struct TensorPrinter { + static void run (std::ostream& os, const Tensor& tensor) { + typedef typename internal::remove_const::type Scalar; + typedef typename Tensor::Index Index; + const Index total_size = 
internal::array_prod(tensor.dimensions()); + if (total_size > 0) { + Map > array(const_cast(tensor.data()), total_size); + os << array; + } + } +}; + + +// Print the tensor as a scalar +template +struct TensorPrinter { + static void run (std::ostream& os, const Tensor& tensor) { + os << tensor.coeff(0); + } +}; +} + +template +std::ostream& operator << (std::ostream& os, const TensorBase& expr) { + typedef TensorEvaluator, DefaultDevice> Evaluator; + typedef typename Evaluator::Dimensions Dimensions; + + // Evaluate the expression if needed + TensorForcedEvalOp eval = expr.eval(); + Evaluator tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + + // Print the result + static const int rank = internal::array_size::value; + internal::TensorPrinter::run(os, tensor); + + // Cleanup. + tensor.cleanup(); + return os; +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h new file mode 100644 index 00000000..7e25f4f2 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -0,0 +1,509 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H + +namespace Eigen { + +/** \class TensorImagePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for image processing. 
+ * This assumes that the input has a least 3 dimensions ordered as follow: + * 1st dimension: channels (of size d) + * 2nd dimension: rows (of size r) + * 3rd dimension: columns (of size c) + * There can be additional dimensions such as time (for video) or batch (for + * bulk processing after the first 3. + * Calling the image patch code with patch_rows and patch_cols is equivalent + * to calling the regular patch extraction code with parameters d, patch_rows, + * patch_cols, and 1 for all the additional dimensions. + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename internal::remove_const::type Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorImagePatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorImagePatchOp type; +}; + +} // end namespace internal + +template +class TensorImagePatchOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + PaddingType padding_type, Scalar 
padding_value) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), + m_padding_type(padding_type), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + DenseIndex padding_top, DenseIndex padding_bottom, + DenseIndex padding_left, DenseIndex padding_right, + Scalar padding_value) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom), + m_padding_left(padding_left), m_padding_right(padding_right), + m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC + DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC + DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + 
EIGEN_DEVICE_FUNC + DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC + bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC + DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC + DenseIndex padding_bottom() const { return m_padding_bottom; } + EIGEN_DEVICE_FUNC + DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC + DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC + PaddingType padding_type() const { return m_padding_type; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType m_padding_type; + const Scalar m_padding_value; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorImagePatchOp XprType; + typedef typename XprType::Index Index; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims + 1; + typedef DSizes Dimensions; + typedef typename internal::remove_const::type Scalar; + typedef TensorEvaluator, + Device> Self; + typedef TensorEvaluator Impl; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, 
+ PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // Caches a few variables. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputRows = input_dims[1]; + m_inputCols = input_dims[2]; + } else { + m_inputDepth = input_dims[NumInputDims-1]; + m_inputRows = input_dims[NumInputDims-2]; + m_inputCols = input_dims[NumInputDims-3]; + } + + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + // The "effective" input rows and input cols are the input rows and cols + // after inflating them with zeros. + // For examples, a 2x3 matrix with row_inflate_strides and + // col_inflate_strides of 2 comes from: + // A B C + // D E F + // + // to a matrix is 3 x 5: + // + // A . B . C + // . . . . . + // D . E . 
F + + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = numext::maxi(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2); + m_colPaddingLeft = numext::maxi(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2); + break; + case PADDING_SAME: + m_outputRows = numext::ceil(m_input_rows_eff / static_cast(m_row_strides)); + m_outputCols = numext::ceil(m_input_cols_eff / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; + break; + default: + eigen_assert(false && "unexpected padding"); + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + + // Dimensions for result of extraction. 
+ if (static_cast(Layout) == static_cast(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_rows + // 2: patch_cols + // 3: number of patches + // 4 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_rows(); + m_dimensions[2] = op.patch_cols(); + m_dimensions[3] = m_outputRows * m_outputCols; + for (int i = 4; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i-1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_rows + // NumDims-3: patch_cols + // NumDims-4: number of patches + // NumDims-5 and beyond: anything else (such as batch). + m_dimensions[NumDims-1] = input_dims[NumInputDims-1]; + m_dimensions[NumDims-2] = op.patch_rows(); + m_dimensions[NumDims-3] = op.patch_cols(); + m_dimensions[NumDims-4] = m_outputRows * m_outputCols; + for (int i = NumDims-5; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for moving the patch in various dimensions. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_colStride = m_dimensions[1]; + m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[3]; + } else { + m_colStride = m_dimensions[NumDims-2]; + m_patchStride = m_colStride * m_dimensions[NumDims-3] * m_dimensions[NumDims-1]; + m_otherStride = m_patchStride * m_dimensions[NumDims-4]; + } + + // Strides for navigating through the input tensor. + m_rowInputStride = m_inputDepth; + m_colInputStride = m_inputDepth * m_inputRows; + m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols; + + // Fast representations of different variables. 
+ m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastInflateRowStride = internal::TensorIntDivisor(m_row_inflate_strides); + m_fastInflateColStride = internal::TensorIntDivisor(m_col_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor(m_input_cols_eff); + + // Number of patches in the width dimension. + m_fastOutputRows = internal::TensorIntDivisor(m_outputRows); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Other ways to index this element. + const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate col index in the input original tensor. + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = (m_col_inflate_strides == 1) ? 
inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + PacketSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 4) ? 
0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - + m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate col indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - + m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { + // no padding + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.template packet(inputIndex); + } + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + const TensorEvaluator& impl() const { return m_impl; } + + Index rowPaddingTop() const { return m_rowPaddingTop; } + Index colPaddingLeft() const { return m_colPaddingLeft; } + Index outputRows() const { return m_outputRows; } + Index outputCols() const { return m_outputCols; } + Index userRowStride() const { return m_row_strides; } + Index userColStride() const { return m_col_strides; } + Index userInRowStride() const { return m_in_row_strides; } + Index userInColStride() const { return m_in_col_strides; } + Index rowInflateStride() const { return m_row_inflate_strides; } + Index colInflateStride() const { return m_col_inflate_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // We conservatively estimate the cost for the code path where the computed + // index is inside the original image and + // TensorEvaluator::CoordAccess is false. 
+ const double compute_cost = 3 * TensorOpCost::DivCost() + + 6 * TensorOpCost::MulCost() + + 8 * TensorOpCost::MulCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + + Index m_otherStride; + Index m_patchStride; + Index m_colStride; + Index m_row_strides; + Index m_col_strides; + + Index m_in_row_strides; + Index m_in_col_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + internal::TensorIntDivisor m_fastOtherStride; + internal::TensorIntDivisor m_fastPatchStride; + internal::TensorIntDivisor m_fastColStride; + internal::TensorIntDivisor m_fastInflateRowStride; + internal::TensorIntDivisor m_fastInflateColStride; + internal::TensorIntDivisor m_fastInputColsEff; + + Index m_rowInputStride; + Index m_colInputStride; + Index m_patchInputStride; + + Index m_inputDepth; + Index m_inputRows; + Index m_inputCols; + + Index m_outputRows; + Index m_outputCols; + + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + internal::TensorIntDivisor m_fastOutputRows; + internal::TensorIntDivisor m_fastOutputDepth; + + Scalar m_paddingValue; + + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h new file mode 100644 index 00000000..2e91471c --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -0,0 
+1,725 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H + + +#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES + +#define EIGEN_HAS_INDEX_LIST + +namespace Eigen { + +/** \internal + * + * \class TensorIndexList + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode a set of Tensor dimensions/indices. + * + * The indices in the list can be known at compile time or at runtime. A mix + * of static and dynamic indices can also be provided if needed. The tensor + * code will attempt to take advantage of the indices that are known at + * compile time to optimize the code it generates. + * + * This functionality requires a c++11 compliant compiler. If your compiler + * is older you need to use arrays of indices instead. + * + * Several examples are provided in the cxx11_tensor_index_list.cpp file. + * + * \sa Tensor + */ + +template +struct type2index { + static const DenseIndex value = n; + EIGEN_DEVICE_FUNC constexpr operator DenseIndex() const { return n; } + EIGEN_DEVICE_FUNC void set(DenseIndex val) { + eigen_assert(val == n); + } +}; + +// This can be used with IndexPairList to get compile-time constant pairs, +// such as IndexPairList, type2indexpair<3,4>>(). 
+template +struct type2indexpair { + static const DenseIndex first = f; + static const DenseIndex second = s; + + constexpr EIGEN_DEVICE_FUNC operator IndexPair() const { + return IndexPair(f, s); + } + + EIGEN_DEVICE_FUNC void set(const IndexPair& val) { + eigen_assert(val.first == f); + eigen_assert(val.second == s); + } +}; + + +template struct NumTraits > +{ + typedef DenseIndex Real; + enum { + IsComplex = 0, + RequireInitialization = false, + ReadCost = 1, + AddCost = 1, + MulCost = 1 + }; + + EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; } + EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; } + EIGEN_DEVICE_FUNC static inline Real highest() { return n; } + EIGEN_DEVICE_FUNC static inline Real lowest() { return n; } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) { + val = new_val; +} +template +EIGEN_DEVICE_FUNC void update_value(type2index& val, DenseIndex new_val) { + val.set(new_val); +} + +template +EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair new_val) { + val = new_val; +} +template +EIGEN_DEVICE_FUNC void update_value(type2indexpair& val, IndexPair new_val) { + val.set(new_val); +} + + +template +struct is_compile_time_constant { + static constexpr bool value = false; +}; + +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool value = true; +}; + +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool 
value = true; +}; + + +template +struct IndexTuple; + +template +struct IndexTuple { + EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() { } + EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) { } + + constexpr static int count = 1 + sizeof...(O); + T head; + IndexTuple others; + typedef T Head; + typedef IndexTuple Other; +}; + +template + struct IndexTuple { + EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() { } + EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) { } + + constexpr static int count = 1; + T head; + typedef T Head; +}; + + +template +struct IndexTupleExtractor; + +template +struct IndexTupleExtractor { + + typedef typename IndexTupleExtractor::ValType ValType; + + EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { + return IndexTupleExtractor::get_val(val.others); + } + + EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { + return IndexTupleExtractor::get_val(val.others); + } + template + EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { + IndexTupleExtractor::set_val(val.others, new_val); + } + +}; + +template + struct IndexTupleExtractor<0, T, O...> { + + typedef T ValType; + + EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { + return val.head; + } + EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { + return val.head; + } + template + EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { + val.head = new_val; + } +}; + + + +template +EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor::ValType& array_get(IndexTuple& tuple) { + return IndexTupleExtractor::get_val(tuple); +} +template +EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor::ValType& array_get(const IndexTuple& tuple) { + return IndexTupleExtractor::get_val(tuple); +} +template + struct array_size > { + static const size_t value = IndexTuple::count; +}; +template + struct 
array_size > { + static const size_t value = IndexTuple::count; +}; + + + + +template +struct tuple_coeff { + template + EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple& t) { + // return array_get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); + return (i == Idx ? array_get(t) : tuple_coeff::get(i, t)); + } + template + EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const ValueT& value) { + if (i == Idx) { + update_value(array_get(t), value); + } else { + tuple_coeff::set(i, t, value); + } + } + + template + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple& t) { + return ((i == Idx) & is_compile_time_constant::ValType>::value) || + tuple_coeff::value_known_statically(i, t); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple& t) { + return is_compile_time_constant::ValType>::value && + tuple_coeff::values_up_to_known_statically(t); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple& t) { + return is_compile_time_constant::ValType>::value && + is_compile_time_constant::ValType>::value && + array_get(t) > array_get(t) && + tuple_coeff::values_up_to_statically_known_to_increase(t); + } +}; + +template +struct tuple_coeff<0, ValueT> { + template + EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return array_get<0>(t)/* * (i == 0)*/; + } + template + EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const ValueT value) { + eigen_assert (i == 0); + update_value(array_get<0>(t), value); + } + template + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple&) { + return is_compile_time_constant::ValType>::value & (i == 0); + } + + template + 
EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple&) { + return is_compile_time_constant::ValType>::value; + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple&) { + return true; + } +}; +} // namespace internal + + + +template +struct IndexList : internal::IndexTuple { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { + return internal::tuple_coeff >::value-1, DenseIndex>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const { + return internal::tuple_coeff >::value-1, DenseIndex>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { + return internal::tuple_coeff >::value-1, DenseIndex>::set(i, *this, value); + } + + EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } + EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple(first, other...) { } + EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple() { } + + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { + return internal::tuple_coeff >::value-1, DenseIndex>::value_known_statically(i, *this); + } + EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { + return internal::tuple_coeff >::value-1, DenseIndex>::values_up_to_known_statically(*this); + } + + EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { + return internal::tuple_coeff >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this); + } +}; + + +template +constexpr IndexList make_index_list(FirstType val1, OtherTypes... 
other_vals) { + return IndexList(val1, other_vals...); +} + + +template +struct IndexPairList : internal::IndexTuple { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair operator[] (const DenseIndex i) const { + return internal::tuple_coeff >::value-1, IndexPair>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair value) { + return internal::tuple_coeff>::value-1, IndexPair >::set(i, *this, value); + } + + EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } + EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple() { } + + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { + return internal::tuple_coeff >::value-1, DenseIndex>::value_known_statically(i, *this); + } +}; + +namespace internal { + +template size_t array_prod(const IndexList& sizes) { + size_t result = 1; + for (int i = 0; i < array_size >::value; ++i) { + result *= sizes[i]; + } + return result; +} + +template struct array_size > { + static const size_t value = array_size >::value; +}; +template struct array_size > { + static const size_t value = array_size >::value; +}; + +template struct array_size > { + static const size_t value = std::tuple_size >::value; +}; +template struct array_size > { + static const size_t value = std::tuple_size >::value; +}; + +template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList& a) { + return IndexTupleExtractor::get_val(a); +} +template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(const IndexList& a) { + return IndexTupleExtractor::get_val(a); +} + +template +struct index_known_statically_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { + return false; + } +}; + +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { + return IndexList().value_known_statically(i); + } +}; + +template +struct 
index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { + return IndexList().value_known_statically(i); + } +}; + + +template +struct all_indices_known_statically_impl { + static constexpr bool run() { + return false; + } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return IndexList().all_values_known_statically(); + } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return IndexList().all_values_known_statically(); + } +}; + + +template +struct indices_statically_known_to_increase_impl { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return false; + } +}; + +template + struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return Eigen::IndexList().values_statically_known_to_increase(); + } +}; + +template + struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return Eigen::IndexList().values_statically_known_to_increase(); + } +}; + + +template +struct index_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) == value); + } +}; + +template +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) == value); + } +}; + + +template +struct index_statically_ne_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, 
const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) != value); + } +}; + +template +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) != value); + } +}; + + +template +struct index_statically_gt_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) > value); + } +}; + +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) > value); + } +}; + + + +template +struct index_statically_lt_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) < value); + } +}; + +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) < value); + } +}; + + + +template +struct index_pair_first_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_pair_first_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList().value_known_statically(i) & + (IndexPairList().operator[](i).first == value); + } +}; + +template +struct 
index_pair_first_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList().value_known_statically(i) & + (IndexPairList().operator[](i).first == value); + } +}; + + + +template +struct index_pair_second_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_pair_second_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList().value_known_statically(i) & + (IndexPairList().operator[](i).second == value); + } +}; + +template +struct index_pair_second_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList().value_known_statically(i) & + (IndexPairList().operator[](i).second == value); + } +}; + + +} // end namespace internal +} // end namespace Eigen + +#else + +namespace Eigen { +namespace internal { + +template +struct index_known_statically_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { + return false; + } +}; + +template +struct all_indices_known_statically_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { + return false; + } +}; + +template +struct indices_statically_known_to_increase_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { + return false; + } +}; + +template +struct index_statically_eq_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_ne_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_gt_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_lt_impl { + static 
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_pair_first_statically_eq_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_pair_second_statically_eq_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + + + +} // end namespace internal +} // end namespace Eigen + +#endif + + +namespace Eigen { +namespace internal { +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(DenseIndex i) { + return index_known_statically_impl::run(i); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() { + return all_indices_known_statically_impl::run(); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() { + return indices_statically_known_to_increase_impl::run(); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(DenseIndex i, DenseIndex value) { + return index_statically_eq_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(DenseIndex i, DenseIndex value) { + return index_statically_ne_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(DenseIndex i, DenseIndex value) { + return index_statically_gt_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i, DenseIndex value) { + return index_statically_lt_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) { + return index_pair_first_statically_eq_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) { + return 
index_pair_second_statically_eq_impl::run(i, value); +} + +} // end namespace internal +} // end namespace Eigen + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h new file mode 100644 index 00000000..1f3d4426 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -0,0 +1,229 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Ke Yang +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H + +namespace Eigen { + +/** \class TensorInflation + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor inflation class. 
+ * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorInflationOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorInflationOp type; +}; + +} // end namespace internal + +template +class TensorInflationOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides) + : m_xpr(expr), m_strides(strides) {} + + EIGEN_DEVICE_FUNC + const Strides& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Strides m_strides; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorInflationOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + 
enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_strides(op.strides()) + { + m_dimensions = m_impl.dimensions(); + // Expand each dimension to the inflated dimension. + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1; + } + + // Remember the strides for fast division. + for (int i = 0; i < NumDims; ++i) { + m_fastStrides[i] = internal::TensorIntDivisor(m_strides[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + } else { // RowMajor + m_outputStrides[NumDims-1] = 1; + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + // Computes the input index given the output index. Returns true if the output + // index doesn't fall into a hole. 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const + { + eigen_assert(index < dimensions().TotalSize()); + *inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[0] * m_strides[0]) { + return false; + } + *inputIndex += index / m_strides[0]; + return true; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[NumDims-1] * m_strides[NumDims-1]) { + return false; + } + *inputIndex += index / m_strides[NumDims - 1]; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + if (getInputIndex(index, &inputIndex)) { + return m_impl.coeff(inputIndex); + } else { + return Scalar(0); + } + } + + // TODO(yangke): optimize this function so that we can detect and produce + // all-zero packets + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (3 * 
TensorOpCost::DivCost() + + 3 * TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + const double input_size = m_impl.dimensions().TotalSize(); + const double output_size = m_dimensions.TotalSize(); + if (output_size == 0) + return TensorOpCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0, + compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + const Strides m_strides; + array, NumDims> m_fastStrides; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h new file mode 100644 index 00000000..8dbe15be --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -0,0 +1,82 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H +#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H + +#if EIGEN_HAS_VARIADIC_TEMPLATES + +#include + +namespace Eigen { + +/** \class TensorInitializer + * \ingroup CXX11_Tensor_Module + * + * \brief Helper template to initialize Tensors from std::initializer_lists. 
+ */ +namespace internal { + +template +struct Initializer { + typedef std::initializer_list< + typename Initializer::InitList> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + for (auto v : vals) { + (*indices)[traits::NumDimensions - N] = i++; + Initializer::run(tensor, indices, v); + } + } +}; + +template +struct Initializer { + typedef std::initializer_list::Scalar> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + // There is likely a faster way to do that than iterating. + for (auto v : vals) { + (*indices)[traits::NumDimensions - 1] = i++; + tensor.coeffRef(*indices) = v; + } + } +}; + +template +struct Initializer { + typedef typename traits::Scalar InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>*, + const InitList& v) { + tensor.coeffRef(0) = v; + } +}; + + +template +void initialize_tensor(TensorEvaluator& tensor, + const typename Initializer::NumDimensions>::InitList& vals) { + Eigen::array::Index, traits::NumDimensions> indices; + Initializer::NumDimensions>::run(tensor, &indices, vals); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h new file mode 100644 index 00000000..dc1b3b87 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -0,0 +1,253 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H +#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H + + +namespace Eigen { + +/** \internal + * + * \class TensorIntDiv + * \ingroup CXX11_Tensor_Module + * + * \brief Fast integer division by a constant. + * + * See the paper from Granlund and Montgomery for explanation. + * (at xxxp://dx.doi.org/10.1145/773473.178249) + * + * \sa Tensor + */ + +namespace internal { + +namespace { + + // Note: result is undefined if val == 0 + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + typename internal::enable_if::type count_leading_zeros(const T val) + { +#ifdef __CUDA_ARCH__ + return __clz(val); +#elif EIGEN_COMP_MSVC + unsigned long index; + _BitScanReverse(&index, val); + return 31 - index; +#else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); + return __builtin_clz(static_cast(val)); +#endif + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + typename internal::enable_if::type count_leading_zeros(const T val) + { +#ifdef __CUDA_ARCH__ + return __clzll(val); +#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 + unsigned long index; + _BitScanReverse64(&index, val); + return 63 - index; +#elif EIGEN_COMP_MSVC + // MSVC's _BitScanReverse64 is not available for 32bits builds. 
+ unsigned int lo = (unsigned int)(val&0xffffffff); + unsigned int hi = (unsigned int)((val>>32)&0xffffffff); + int n; + if(hi==0) + n = 32 + count_leading_zeros(lo); + else + n = count_leading_zeros(hi); + return n; +#else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); + return __builtin_clzll(static_cast(val)); +#endif + } + + template + struct UnsignedTraits { + typedef typename conditional::type type; + }; + + template + struct DividerTraits { + typedef typename UnsignedTraits::type type; + static const int N = sizeof(T) * 8; + }; + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { +#if defined(__CUDA_ARCH__) + return __umulhi(a, b); +#else + return (static_cast(a) * b) >> 32; +#endif + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { +#if defined(__CUDA_ARCH__) + return __umul64hi(a, b); +#elif defined(__SIZEOF_INT128__) + __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); + return static_cast(v >> 64); +#else + return (TensorUInt128, uint64_t>(a) * TensorUInt128, uint64_t>(b)).upper(); +#endif + } + + template + struct DividerHelper { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) { + EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE); + return static_cast((static_cast(1) << (N+log_div)) / divider - (static_cast(1) << N) + 1); + } + }; + + template + struct DividerHelper<64, T> { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { +#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) + return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); +#else + const uint64_t shift = 1ULL << log_div; + TensorUInt128 result = TensorUInt128 >(shift, 0) / TensorUInt128, uint64_t>(divider) + 
- TensorUInt128, static_val<0> >(1, 0) + + TensorUInt128, static_val<1> >(1); + return static_cast(result); +#endif + } + }; +} + + +template +struct TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + multiplier = 0; + shift1 = 0; + shift2 = 0; + } + + // Must have 0 < divider < 2^31. This is relaxed to + // 0 < divider < 2^63 when using 64-bit indices on platforms that support + // the __uint128_t type. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = DividerTraits::N; + eigen_assert(static_cast::type>(divider) < NumTraits::highest()/2); + eigen_assert(divider > 0); + + // fast ln2 + const int leading_zeros = count_leading_zeros(static_cast(divider)); + int log_div = N - leading_zeros; + // if divider is a power of two then log_div is 1 more than it should be. + if ((static_cast::type>(1) << (log_div-1)) == static_cast::type>(divider)) + log_div--; + + multiplier = DividerHelper::computeMultiplier(log_div, divider); + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div-1 : 0; + } + + // Must have 0 <= numerator. On platforms that dont support the __uint128_t + // type numerator should also be less than 2^32-1. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + eigen_assert(static_cast::type>(numerator) < NumTraits::highest()/2); + //eigen_assert(numerator >= 0); // this is implicitly asserted by the line above + + UnsignedType t1 = muluh(multiplier, numerator); + UnsignedType t = (static_cast(numerator) - t1) >> shift1; + return (t1 + t) >> shift2; + } + + private: + typedef typename DividerTraits::type UnsignedType; + UnsignedType multiplier; + int32_t shift1; + int32_t shift2; +}; + + +// Optimized version for signed 32 bit integers. +// Derived from Hacker's Delight. 
+// Only works for divisors strictly greater than one +template <> +class TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + magic = 0; + shift = 0; + } + // Must have 2 <= divider + EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) { + eigen_assert(divider >= 2); + calcMagic(divider); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { +#ifdef __CUDA_ARCH__ + return (__umulhi(magic, n) >> shift); +#else + uint64_t v = static_cast(magic) * static_cast(n); + return (static_cast(v >> 32) >> shift); +#endif + } + +private: + // Compute the magic numbers. See Hacker's Delight section 10 for an in + // depth explanation. + EIGEN_DEVICE_FUNC void calcMagic(int32_t d) { + const unsigned two31 = 0x80000000; // 2**31. + unsigned ad = d; + unsigned t = two31 + (ad >> 31); + unsigned anc = t - 1 - t%ad; // Absolute value of nc. + int p = 31; // Init. p. + unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|. + unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|). + unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|. + unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|). + unsigned delta = 0; + do { + p = p + 1; + q1 = 2*q1; // Update q1 = 2**p/|nc|. + r1 = 2*r1; // Update r1 = rem(2**p, |nc|). + if (r1 >= anc) { // (Must be an unsigned + q1 = q1 + 1; // comparison here). + r1 = r1 - anc;} + q2 = 2*q2; // Update q2 = 2**p/|d|. + r2 = 2*r2; // Update r2 = rem(2**p, |d|). + if (r2 >= ad) { // (Must be an unsigned + q2 = q2 + 1; // comparison here). 
+ r2 = r2 - ad;} + delta = ad - r2; + } while (q1 < delta || (q1 == delta && r1 == 0)); + + magic = (unsigned)(q2 + 1); + shift = p - 32; + } + + uint32_t magic; + int32_t shift; +}; + + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { + return divisor.divide(numerator); +} + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h new file mode 100644 index 00000000..e874e94f --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -0,0 +1,209 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H + +namespace Eigen { + +/** \class TensorLayoutSwap + * \ingroup CXX11_Tensor_Module + * + * \brief Swap the layout from col-major to row-major, or row-major + * to col-major, and invert the order of the dimensions. + * + * Beware: the dimensions are reversed by this operation. If you want to + * preserve the ordering of the dimensions, you need to combine this + * operation with a shuffle. 
+ * + * \example: + * Tensor input(2, 4); + * Tensor output = input.swap_layout(); + * eigen_assert(output.dimension(0) == 4); + * eigen_assert(output.dimension(1) == 2); + * + * array shuffle(1, 0); + * output = input.swap_layout().shuffle(shuffle); + * eigen_assert(output.dimension(0) == 2); + * eigen_assert(output.dimension(1) == 4); + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = (traits::Layout == ColMajor) ? RowMajor : ColMajor; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorLayoutSwapOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorLayoutSwapOp type; +}; + +} // end namespace internal + + + +template +class TensorLayoutSwapOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC + 
EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorLayoutSwapOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + for(int i = 0; i < NumDims; ++i) { + m_dimensions[i] = m_impl.dimensions()[NumDims-1-i]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return 
m_impl.data(); } + + const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; + Dimensions m_dimensions; +}; + + +// Eval as lvalue +template + struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorLayoutSwapOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, + CoordAccess = false // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket(index, x); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h new file mode 100644 index 00000000..5165bb61 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -0,0 +1,54 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H + + +/** use this macro in sfinae selection in templated functions + * + * template::value , int >::type = 0 + * > + * void foo(){} + * + * becomes => + * + * template::value ) + * > + * void foo(){} + */ + +// SFINAE requires variadic templates +#ifndef __CUDACC__ +#if EIGEN_HAS_VARIADIC_TEMPLATES + // SFINAE doesn't work for gcc <= 4.7 + #ifdef EIGEN_COMP_GNUC + #if EIGEN_GNUC_AT_LEAST(4,8) + #define EIGEN_HAS_SFINAE + #endif + #else + #define EIGEN_HAS_SFINAE + #endif +#endif +#endif + +#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \ + typename internal::enable_if< ( __condition__ ) , int >::type = 0 + + +#if EIGEN_HAS_CONSTEXPR +#define EIGEN_CONSTEXPR constexpr +#else +#define EIGEN_CONSTEXPR +#endif + + +#endif diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h new file mode 100644 index 00000000..ee430ae4 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -0,0 +1,323 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H + +namespace Eigen { + +// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) + +/** \class TensorMap + * \ingroup CXX11_Tensor_Module + * + * \brief A tensor expression mapping an existing array of data. + * + */ +/// `template class MakePointer_` is added to convert the host pointer to the device pointer. 
+/// It is added due to the fact that for our device compiler `T*` is not allowed. +/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. +/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_` is `T*` . +/// Therefore, by adding the default value, we managed to convert the type and it does not break any +/// existing code as its default value is `T*`. +template class MakePointer_> class TensorMap : public TensorBase > +{ + public: + typedef TensorMap Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::traits::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + /* typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Scalar *, + const Scalar *>::type + PointerType;*/ + typedef typename MakePointer_::Type PointerType; + typedef PointerType PointerArgType; + + static const int Options = Options_; + + static const Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { + IsAligned = ((int(Options_)&Aligned)==Aligned), + Layout = PlainObjectType::Layout, + CoordAccess = true, + RawAccess = true + }; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... 
otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { + EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { + EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { + EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { + EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor) + : m_data(tensor.data()), m_dimensions(tensor.dimensions()) + { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PointerType data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const PointerType data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_data[0]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) const + { + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[1]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + 
return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_data[0]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + { + static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + const std::size_t NumDims = sizeof...(otherIndices) + 2; + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[1]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); + return m_data[index]; 
+ } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Self& operator=(const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + private: + typename MakePointer_::Type m_data; + Dimensions m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h new file mode 100644 index 00000000..35075f85 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -0,0 +1,218 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_H + +namespace Eigen { + +template struct Cond {}; + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T1& choose(Cond, const T1& first, const T2&) { + return first; +} + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T2& choose(Cond, const T1&, const T2& second) { + return second; +} + + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const X x, const Y y) { + return static_cast((x + y - 1) / y); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const T x, const T y) { + return static_cast((x + y - 1) / y); +} + +template struct max_n_1 { + static const size_t size = n; +}; +template <> struct max_n_1<0> { + static const size_t size = 1; +}; + + +// Default packet types +template +struct PacketType : internal::packet_traits { + typedef typename internal::packet_traits::type type; +}; + +// For CUDA packet types when using a GpuDevice +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) +template <> +struct PacketType { + typedef half2 type; + static const int size = 2; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasLog = 1, + HasLog1p = 0, + HasLog10 = 0, + HasPow = 1, + }; +}; +#endif + +#if defined(EIGEN_USE_SYCL) +template + struct PacketType { + typedef T type; + static const int size = 1; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasArg = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + 
HasBlend = 0 + }; +}; +#endif + + +// Tuple mimics std::pair but works on e.g. nvcc. +template struct Tuple { + public: + U first; + V second; + + typedef U first_type; + typedef V second_type; + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Tuple() : first(), second() {} + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Tuple(const U& f, const V& s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Tuple& operator= (const Tuple& rhs) { + if (&rhs == this) return *this; + first = rhs.first; + second = rhs.second; + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void swap(Tuple& rhs) { + using numext::swap; + swap(first, rhs.first); + swap(second, rhs.second); + } +}; + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +bool operator==(const Tuple& x, const Tuple& y) { + return (x.first == y.first && x.second == y.second); +} + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +bool operator!=(const Tuple& x, const Tuple& y) { + return !(x == y); +} + + +// Can't use std::pairs on cuda devices +template struct IndexPair { + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {} + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC void set(IndexPair val) { + first = val.first; + second = val.second; + } + + Idx first; + Idx second; +}; + + +#ifdef EIGEN_HAS_SFINAE +namespace internal { + + template + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + array customIndices2Array(IndexType& idx, numeric_list) { + return { idx[Is]... 
}; + } + template + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + array customIndices2Array(IndexType&, numeric_list) { + return array(); + } + + /** Make an array (for index/dimensions) out of a custom index */ + template + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + array customIndices2Array(IndexType& idx) { + return customIndices2Array(idx, typename gen_numeric_list::type{}); + } + + + template + struct is_base_of + { + + typedef char (&yes)[1]; + typedef char (&no)[2]; + + template + struct Host + { + operator BB*() const; + operator DD*(); + }; + + template + static yes check(D*, T); + static no check(B*, int); + + static const bool value = sizeof(check(Host(), int())) == sizeof(yes); + }; + +} +#endif + + + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h new file mode 100644 index 00000000..09f06343 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -0,0 +1,888 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H +#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H + +namespace Eigen { + +/** \class TensorReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. 
+ * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = array_size::value; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorReshapingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorReshapingOp type; +}; + +} // end namespace internal + + + +template +class TensorReshapingOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const NewDimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const NewDimensions m_dims; +}; + + +// Eval 
as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.dimensions()) + { + // The total size of the reshaped tensor must be equal to the total size + // of the input tensor. + eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast(m_impl.data()); } + + EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; + NewDimensions m_dimensions; +}; + + +// Eval as lvalue +template + struct TensorEvaluator, Device> + : public TensorEvaluator, Device> + 
+{ + typedef TensorEvaluator, Device> Base; + typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket(index, x); + } +}; + + +/** \class TensorSlicing + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor slicing class. 
+ * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = array_size::value; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorSlicingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorSlicingOp type; +}; + +} // end namespace internal + + + +template +class TensorSlicingOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes) + : m_xpr(expr), m_indices(indices), m_sizes(sizes) {} + + EIGEN_DEVICE_FUNC + const StartIndices& startIndices() const { return m_indices; } + EIGEN_DEVICE_FUNC + const Sizes& sizes() const { return m_sizes; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + + 
protected: + typename XprType::Nested m_xpr; + const StartIndices m_indices; + const Sizes m_sizes; +}; + + +// Fixme: figure out the exact threshold +namespace { +template struct MemcpyTriggerForSlicing { + EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { } + EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; } + + private: + Index threshold_; +}; + +// It is very expensive to start the memcpy kernel on GPU: we therefore only +// use it for large copies. +#ifdef EIGEN_USE_GPU +template struct MemcpyTriggerForSlicing { + EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { } + EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; } +}; +#endif +} + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets and sizes. + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) + { + for (std::size_t i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const Sizes& output_dims = op.sizes(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. 
+ m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + } + } else { + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + } + + // Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed. + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + } + } + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (!NumTraits::type>::RequireInitialization && data && m_impl.data()) { + Index contiguous_values = 1; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } else { + for (int i = NumDims-1; i >= 0; --i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } + // Use memcpy if it's going to be faster than using the regular evaluation. 
+ const MemcpyTriggerForSlicing trigger(m_device); + if (trigger(contiguous_values)) { + Scalar* src = (Scalar*)m_impl.data(); + for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + Index offset = srcCoeff(i); + m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); + } + return false; + } + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + 
m_offsets[NumDims-1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + Scalar* result = m_impl.data(); + if (result) { + Index offset = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i+1; j < NumDims; ++j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i-1; j >= 0; --j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } + return result + offset; + } + return NULL; + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / 
m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[NumDims-1]); + } + return inputIndex; + } + + array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + const Device& m_device; + Dimensions m_dimensions; + const StartIndices m_offsets; +}; + + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = internal::unpacket_traits::size; + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * 
this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[0]); + inputIndices[1] += (indices[1] + this->m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket(inputIndices[0], x); + } + else { + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + internal::pstore(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + for (int i = 1; i < packetSize-1; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + } +}; + + + +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = array_size::value; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorStridingSlicingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorStridingSlicingOp type; +}; + +} // end namespace internal + + +template +class TensorStridingSlicingOp : public TensorBase > +{ + public: + typedef typename internal::traits::Scalar Scalar; + 
typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp( + const XprType& expr, const StartIndices& startIndices, + const StopIndices& stopIndices, const Strides& strides) + : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices), + m_strides(strides) {} + + EIGEN_DEVICE_FUNC + const StartIndices& startIndices() const { return m_startIndices; } + EIGEN_DEVICE_FUNC + const StartIndices& stopIndices() const { return m_stopIndices; } + EIGEN_DEVICE_FUNC + const StartIndices& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run( + assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_startIndices; + const StopIndices m_stopIndices; + const Strides m_strides; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorStridingSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets and sizes. 
+ IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()) + { + // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero + DSizes startIndicesClamped, stopIndicesClamped; + for (size_t i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); + if(m_strides[i]>0){ + startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); + }else{ + /* implies m_strides[i]<0 by assert */ + startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); + } + m_startIndices[i] = startIndicesClamped[i]; + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // check for degenerate intervals and compute output tensor shape + bool degenerate = false;; + for(int i = 0; i < NumDims; i++){ + Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; + if(interval == 0 || ((interval<0) != (m_strides[i]<0))){ + m_dimensions[i] = 0; + degenerate = true; + }else{ + m_dimensions[i] = interval / m_strides[i] + + (interval % m_strides[i] != 0 ? 
1 : 0); + eigen_assert(m_dimensions[i] >= 0); + } + } + Strides output_dims = m_dimensions; + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = m_strides[0]; + m_offsets[0] = startIndicesClamped[0]; + Index previousDimProduct = 1; + for (int i = 1; i < NumDims; ++i) { + previousDimProduct *= input_dims[i-1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash + m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); + } + } else { + m_inputStrides[NumDims-1] = m_strides[NumDims-1]; + m_offsets[NumDims-1] = startIndicesClamped[NumDims-1]; + Index previousDimProduct = 1; + for (int i = NumDims - 2; i >= 0; --i) { + previousDimProduct *= input_dims[i+1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; + // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash + m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 
1 : m_outputStrides[i]); + } + } + m_block_total_size_max = numext::maxi(static_cast(1), + device.lastLevelCacheSize() / + sizeof(Scalar)); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Strides Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + return NULL; + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } else { + for (int i = 0; i < NumDims; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } + return inputIndex; + } + + static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { + return numext::maxi(min, numext::mini(max,value)); + } + + array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + TensorEvaluator m_impl; 
+ const Device& m_device; + DSizes m_startIndices; // clamped startIndices + DSizes m_dimensions; + DSizes m_offsets; // offset in a flattened shape + const Strides m_strides; + std::size_t m_block_total_size_max; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorStridingSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = TensorEvaluator::CoordAccess, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Strides Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h new file mode 100644 index 00000000..66a3b75e --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -0,0 +1,397 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H + +namespace Eigen { + +/** \class TensorPadding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor padding class. + * At the moment only padding with a constant value is supported. + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorPaddingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorPaddingOp type; +}; + +} // end namespace internal + + + +template +class TensorPaddingOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value) + : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC + const PaddingDimensions& padding() const { return m_padding_dims; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; 
} + + protected: + typename XprType::Nested m_xpr; + const PaddingDimensions m_padding_dims; + const Scalar m_padding_value; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorPaddingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = true, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()) + { + // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead + // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector + // of 1 element first and then pad. 
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Compute dimensions + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] += m_padding[i].first + m_padding[i].second; + } + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; + } else { + m_inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1]; + } + m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (isPaddingAtIndexForDim(idx, i)) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (isPaddingAtIndexForDim(index, 0)) { + return m_paddingValue; + } + inputIndex += (index - m_padding[0].first); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / 
m_outputStrides[i+1]; + if (isPaddingAtIndexForDim(idx, i)) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + if (isPaddingAtIndexForDim(index, NumDims-1)) { + return m_paddingValue; + } + inputIndex += (index - m_padding[NumDims-1].first); + } + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + if (static_cast(Layout) == static_cast(ColMajor)) { + return packetColMajor(index); + } + return packetRowMajor(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + TensorOpCost cost = m_impl.costPerCoeff(vectorized); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) + updateCostPerDimension(cost, i, i == 0); + } else { + for (int i = NumDims - 1; i >= 0; --i) + updateCostPerDimension(cost, i, i == NumDims - 1); + } + return cost; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( + Index index, int dim_index) const { +#if defined(EIGEN_HAS_INDEX_LIST) + return (!internal::index_pair_first_statically_eq(dim_index, 0) && + index < m_padding[dim_index].first) || + (!internal::index_pair_second_statically_eq(dim_index, 0) && + index >= m_dimensions[dim_index] - m_padding[dim_index].second); +#else + return (index < m_padding[dim_index].first) || + (index >= m_dimensions[dim_index] - m_padding[dim_index].second); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero( + int dim_index) const { +#if defined(EIGEN_HAS_INDEX_LIST) + return internal::index_pair_first_statically_eq(dim_index, 0); +#else + EIGEN_UNUSED_VARIABLE(dim_index); + return false; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero( + int dim_index) const { +#if 
defined(EIGEN_HAS_INDEX_LIST) + return internal::index_pair_second_statically_eq(dim_index, 0); +#else + EIGEN_UNUSED_VARIABLE(dim_index); + return false; +#endif + } + + + void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const { + const double in = static_cast(m_impl.dimensions()[i]); + const double out = in + m_padding[i].first + m_padding[i].second; + if (out == 0) + return; + const double reduction = in / out; + cost *= reduction; + if (first) { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + + reduction * (1 * TensorOpCost::AddCost())); + } else { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + + reduction * (2 * TensorOpCost::MulCost() + + 1 * TensorOpCost::DivCost())); + } + } + + protected: + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index first = index; + const Index last = index + PacketSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; + const Index lastPaddedRight = m_outputStrides[i+1]; + + if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. 
+ const Index idx = index / m_outputStrides[i]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + PacketSize - 1; + const Index first = index; + const Index lastPaddedLeft = m_padding[0].first; + const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); + const Index lastPaddedRight = m_outputStrides[1]; + + if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. + inputIndex += (index - m_padding[0].first); + return m_impl.template packet(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + + for (int i = 0; i < NumDims - 1; ++i) { + const Index first = index; + const Index last = index + PacketSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; + const Index lastPaddedRight = m_outputStrides[i]; + + if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { + // all the coefficient are in the padding zone. 
+ return internal::pset1(m_paddingValue); + } + else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. + const Index idx = index / m_outputStrides[i+1]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + PacketSize - 1; + const Index first = index; + const Index lastPaddedLeft = m_padding[NumDims-1].first; + const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); + const Index lastPaddedRight = m_outputStrides[NumDims-1]; + + if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. 
+ inputIndex += (index - m_padding[NumDims-1].first); + return m_impl.template packet(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + PaddingDimensions m_padding; + + Scalar m_paddingValue; +}; + + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h new file mode 100644 index 00000000..cadb5fde --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -0,0 +1,269 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H + +namespace Eigen { + +/** \class TensorPatch + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor patch class. 
+ * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorPatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorPatchOp type; +}; + +} // end namespace internal + + + +template +class TensorPatchOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) + : m_xpr(expr), m_patch_dims(patch_dims) {} + + EIGEN_DEVICE_FUNC + const PatchDim& patch_dims() const { return m_patch_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PatchDim m_patch_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorPatchOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value + 1; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = 
internal::unpacket_traits::size; + + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + Index num_patches = 1; + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const PatchDim& patch_dims = op.patch_dims(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims-1; ++i) { + m_dimensions[i] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[NumDims-1] = num_patches; + + m_inputStrides[0] = 1; + m_patchStrides[0] = 1; + for (int i = 1; i < NumDims-1; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1); + } + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + for (int i = 0; i < NumDims-1; ++i) { + m_dimensions[i+1] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[0] = num_patches; + + m_inputStrides[NumDims-2] = 1; + m_patchStrides[NumDims-2] = 1; + for (int i = NumDims-3; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_patchStrides[i] = m_patchStrides[i+1] * (input_dims[i+1] - patch_dims[i+1] + 1); + } + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims-2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + 
m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? NumDims - 1 : 0; + // Find the location of the first element of the patch. + Index patchIndex = index / m_outputStrides[output_stride_index]; + // Find the offset of the element wrt the location of the first element. + Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i]; + patchOffset -= offsetIdx * m_outputStrides[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } else { + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i+1]; + patchOffset -= offsetIdx * m_outputStrides[i+1]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } + inputIndex += (patchIndex + patchOffset); + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? 
NumDims - 1 : 0; + Index indices[2] = {index, index + PacketSize - 1}; + Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], + indices[1] / m_outputStrides[output_stride_index]}; + Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], + indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]}; + + Index inputIndices[2] = {0, 0}; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], + patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], + patchOffsets[1] / m_outputStrides[i]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } else { + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], + patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i+1], + patchOffsets[1] / m_outputStrides[i+1]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i+1]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i+1]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } + inputIndices[0] += (patchIndices[0] + patchOffsets[0]); + inputIndices[1] += (patchIndices[1] + patchOffsets[1]); + + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; 
+ } + else { + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < PacketSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (TensorOpCost::DivCost() + + TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + array m_patchStrides; + + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h new file mode 100644 index 00000000..68560d0a --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -0,0 +1,276 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H +#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H + +namespace Eigen { +namespace internal { + +namespace { + +EIGEN_DEVICE_FUNC uint64_t get_random_seed() { +#ifdef __CUDA_ARCH__ + // We don't support 3d kernels since we currently only use 1 and + // 2d kernels. 
+ assert(threadIdx.z == 0); + return clock64() + + blockIdx.x * blockDim.x + threadIdx.x + + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); + +#elif defined _WIN32 + // Use the current time as a baseline. + SYSTEMTIME st; + GetSystemTime(&st); + int time = st.wSecond + 1000 * st.wMilliseconds; + // Mix in a random number to make sure that we get different seeds if + // we try to generate seeds faster than the clock resolution. + // We need 2 random values since the generator only generate 16 bits at + // a time (xxxps://msdn.microsoft.com/en-us/library/398ax69y.aspx) + int rnd1 = ::rand(); + int rnd2 = ::rand(); + uint64_t rnd = (rnd1 | rnd2 << 16) ^ time; + return rnd; + +#elif defined __APPLE__ + // Same approach as for win32, except that the random number generator + // is better (// xxxps://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random). + uint64_t rnd = ::random() ^ mach_absolute_time(); + return rnd; + +#else + // Augment the current time with pseudo random number generation + // to ensure that we get different seeds if we try to generate seeds + // faster than the clock resolution. + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + uint64_t rnd = ::random() ^ ts.tv_nsec; + return rnd; +#endif +} + +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) { + // TODO: Unify with the implementation in the non blocking thread pool. + uint64_t current = *state; + // Update the internal state + *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + // Generate the random output (using the PCG-XSH-RS scheme) + return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); +} + +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { + seed = seed ? 
seed : get_random_seed(); + return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; +} + +} // namespace + + +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +T RandomToTypeUniform(uint64_t* state) { + unsigned rnd = PCG_XSH_RS_generator(state); + return static_cast(rnd); +} + + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Eigen::half RandomToTypeUniform(uint64_t* state) { + Eigen::half result; + // Generate 10 random bits for the mantissa + unsigned rnd = PCG_XSH_RS_generator(state); + result.x = static_cast(rnd & 0x3ffu); + // Set the exponent + result.x |= (static_cast(15) << 10); + // Return the final result + return result - Eigen::half(1.0f); +} + + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float RandomToTypeUniform(uint64_t* state) { + typedef union { + uint32_t raw; + float fp; + } internal; + internal result; + // Generate 23 random bits for the mantissa mantissa + const unsigned rnd = PCG_XSH_RS_generator(state); + result.raw = rnd & 0x7fffffu; + // Set the exponent + result.raw |= (static_cast(127) << 23); + // Return the final result + return result.fp - 1.0f; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double RandomToTypeUniform(uint64_t* state) { + typedef union { + uint64_t raw; + double dp; + } internal; + internal result; + result.raw = 0; + // Generate 52 random bits for the mantissa + // First generate the upper 20 bits + unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu; + // The generate the lower 32 bits + unsigned rnd2 = PCG_XSH_RS_generator(state); + result.raw = (static_cast(rnd1) << 32) | rnd2; + // Set the exponent + result.raw |= (static_cast(1023) << 52); + // Return the final result + return result.dp - 1.0; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex RandomToTypeUniform >(uint64_t* state) { + return std::complex(RandomToTypeUniform(state), + RandomToTypeUniform(state)); +} +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex RandomToTypeUniform >(uint64_t* 
state) { + return std::complex(RandomToTypeUniform(state), + RandomToTypeUniform(state)); +} + +template class UniformRandomGenerator { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( + uint64_t seed = 0) { + m_state = PCG_XSH_RS_state(seed); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( + const UniformRandomGenerator& other) { + m_state = other.m_state; + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T operator()(Index i) const { + uint64_t local_state = m_state + i; + T result = RandomToTypeUniform(&local_state); + m_state = local_state; + return result; + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(Index i) const { + const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX T values[packetSize]; + uint64_t local_state = m_state + i; + for (int j = 0; j < packetSize; ++j) { + values[j] = RandomToTypeUniform(&local_state); + } + m_state = local_state; + return internal::pload(values); + } + + private: + mutable uint64_t m_state; +}; + +template +struct functor_traits > { + enum { + // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)). + Cost = 12 * NumTraits::AddCost * + ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)), + PacketAccess = UniformRandomGenerator::PacketAccess + }; +}; + + + +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +T RandomToTypeNormal(uint64_t* state) { + // Use the ratio of uniform method to generate numbers following a normal + // distribution. See for example Numerical Recipes chapter 7.3.9 for the + // details. 
+ T u, v, q; + do { + u = RandomToTypeUniform(state); + v = T(1.7156) * (RandomToTypeUniform(state) - T(0.5)); + const T x = u - T(0.449871); + const T y = numext::abs(v) + T(0.386595); + q = x*x + y * (T(0.196)*y - T(0.25472)*x); + } while (q > T(0.27597) && + (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u)); + + return v/u; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex RandomToTypeNormal >(uint64_t* state) { + return std::complex(RandomToTypeNormal(state), + RandomToTypeNormal(state)); +} +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex RandomToTypeNormal >(uint64_t* state) { + return std::complex(RandomToTypeNormal(state), + RandomToTypeNormal(state)); +} + + +template class NormalRandomGenerator { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { + m_state = PCG_XSH_RS_state(seed); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( + const NormalRandomGenerator& other) { + m_state = other.m_state; + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T operator()(Index i) const { + uint64_t local_state = m_state + i; + T result = RandomToTypeNormal(&local_state); + m_state = local_state; + return result; + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(Index i) const { + const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX T values[packetSize]; + uint64_t local_state = m_state + i; + for (int j = 0; j < packetSize; ++j) { + values[j] = RandomToTypeNormal(&local_state); + } + m_state = local_state; + return internal::pload(values); + } + + private: + mutable uint64_t m_state; +}; + + +template +struct functor_traits > { + enum { + // On average, we need to generate about 3 random numbers + // 15 mul, 8 add, 1.5 logs + Cost = 3 * functor_traits >::Cost + + 15 * NumTraits::AddCost + 8 * 
NumTraits::AddCost + + 3 * functor_traits >::Cost / 2, + PacketAccess = NormalRandomGenerator::PacketAccess + }; +}; + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h new file mode 100644 index 00000000..33439138 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -0,0 +1,781 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H + +namespace Eigen { + +/** \class TensorReduction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reduction class. + * + */ + +namespace internal { + template class MakePointer_ > + struct traits > + : traits +{ + typedef traits XprTraits; + typedef typename XprTraits::Scalar Scalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + static const int NumDimensions = XprTraits::NumDimensions - array_size::value; + static const int Layout = XprTraits::Layout; + + template struct MakePointer { + // Intermediate typedef to workaround MSVC issue. 
+ typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; +}; + +template class MakePointer_> +struct eval, Eigen::Dense> +{ + typedef const TensorReductionOp& type; +}; + +template class MakePointer_> +struct nested, 1, typename eval >::type> +{ + typedef TensorReductionOp type; +}; + + +template struct DimInitializer { + template EIGEN_DEVICE_FUNC + static void run(const InputDims& input_dims, + const array::value>& reduced, + OutputDims* output_dims, ReducedDims* reduced_dims) { + const int NumInputDims = internal::array_size::value; + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + (*reduced_dims)[reduceIndex] = input_dims[i]; + ++reduceIndex; + } else { + (*output_dims)[outputIndex] = input_dims[i]; + ++outputIndex; + } + } + } +}; + +template <> struct DimInitializer > { + template EIGEN_DEVICE_FUNC + static void run(const InputDims& input_dims, const array&, + Sizes<>*, array* reduced_dims) { + const int NumInputDims = internal::array_size::value; + for (int i = 0; i < NumInputDims; ++i) { + (*reduced_dims)[i] = input_dims[i]; + } + } +}; + + +template +struct are_inner_most_dims { + static const bool value = false; +}; +template +struct preserve_inner_most_dims { + static const bool value = false; +}; + +#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES +template +struct are_inner_most_dims{ + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_eq(0, 0); + static const bool tmp3 = index_statically_eq(array_size::value-1, array_size::value-1); + static const bool value = tmp1 & tmp2 & tmp3; +}; +template +struct are_inner_most_dims{ + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_eq(0, NumTensorDims - array_size::value); + static const bool tmp3 = index_statically_eq(array_size::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & 
tmp2 & tmp3; + +}; +template +struct preserve_inner_most_dims{ + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_gt(0, 0); + static const bool value = tmp1 & tmp2; + +}; +template +struct preserve_inner_most_dims{ + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_lt(array_size::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & tmp2; +}; +#endif + + +template +struct GenericDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + GenericDimReducer::reduce(self, input, reducer, accum); + } + } +}; +template +struct GenericDimReducer<0, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + for (int j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reduce(self.m_impl.coeff(input), accum); + } + } +}; +template +struct GenericDimReducer<-1, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) { + reducer.reduce(self.m_impl.coeff(index), accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = 0; j < 
numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalize(accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const int packetSize = internal::unpacket_traits::size; + const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + typename Self::PacketReturnType p = reducer.template initializePacket(); + for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { + reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &p); + } + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalizeBoth(accum, p); + } +}; + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { + eigen_assert(false && "should never be called"); + } +}; + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + InnerMostDimPreserver::reduce(self, input, reducer, accum); + } + } +}; + +template +struct InnerMostDimPreserver<0, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + for (typename 
Self::Index j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reducePacket(self.m_impl.template packet(input), accum); + } + } +}; +template +struct InnerMostDimPreserver<-1, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { + eigen_assert(false && "should never be called"); + } +}; + +// Default full reducer +template +struct FullReducer { + static const bool HasOptimizedImplementation = false; + + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { + const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + } +}; + + +#ifdef EIGEN_USE_THREADS +// Multithreaded full reducers +template +struct FullReducerShard { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer, + typename Self::CoeffReturnType* output) { + *output = InnerMostDimReducer::reduce( + self, firstIndex, numValuesToReduce, reducer); + } +}; + +// Multithreaded full reducer +template +struct FullReducer { + static const bool HasOptimizedImplementation = !Op::IsStateful; + static const int PacketSize = + unpacket_traits::size; + + // launch one reducer per thread and accumulate the result. 
+ static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, + typename Self::CoeffReturnType* output) { + typedef typename Self::Index Index; + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + if (num_coeffs == 0) { + *output = reducer.finalize(reducer.initialize()); + return; + } + const TensorOpCost cost = + self.m_impl.costPerCoeff(Vectorizable) + + TensorOpCost(0, 0, internal::functor_traits::Cost, Vectorizable, + PacketSize); + const int num_threads = TensorCostModel::numThreads( + num_coeffs, cost, device.numThreads()); + if (num_threads == 1) { + *output = + InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + return; + } + const Index blocksize = + std::floor(static_cast(num_coeffs) / num_threads); + const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); + + Barrier barrier(internal::convert_index(numblocks)); + MaxSizeVector shards(numblocks, reducer.initialize()); + for (Index i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier(&barrier, &FullReducerShard::run, + self, i * blocksize, blocksize, reducer, + &shards[i]); + } + typename Self::CoeffReturnType finalShard; + if (numblocks * blocksize < num_coeffs) { + finalShard = InnerMostDimReducer::reduce( + self, numblocks * blocksize, num_coeffs - numblocks * blocksize, + reducer); + } else { + finalShard = reducer.initialize(); + } + barrier.Wait(); + + for (Index i = 0; i < numblocks; ++i) { + reducer.reduce(shards[i], &finalShard); + } + *output = reducer.finalize(finalShard); + } +}; + +#endif + + +// Default inner reducer +template +struct InnerReducer { + static const bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; + +// Default outer reducer +template +struct OuterReducer { + 
static const bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; + + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template +__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); + + +#ifdef EIGEN_HAS_CUDA_FP16 +template +__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); +template +__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*); +template +__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*); + +#endif + +template +__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + +template +__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); +#endif + +} // end namespace internal + + +template class MakePointer_> +class TensorReductionOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dims& dims() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Op& reducer() 
const { return m_reducer; } + + protected: + typename XprType::Nested m_expr; + const Dims m_dims; + const Op m_reducer; +}; + + +// Eval as rvalue +template class MakePointer_, typename Device> +struct TensorEvaluator, Device> +{ + typedef TensorReductionOp XprType; + typedef typename XprType::Index Index; + typedef ArgType ChildType; + typedef typename TensorEvaluator::Dimensions InputDimensions; + static const int NumInputDims = internal::array_size::value; + static const int NumReducedDims = internal::array_size::value; + static const int NumOutputDims = NumInputDims - NumReducedDims; + typedef typename internal::conditional, DSizes >::type Dimensions; + typedef typename XprType::Scalar Scalar; + typedef TensorEvaluator, Device> Self; + static const bool InputPacketAccess = TensorEvaluator::PacketAccess; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, + PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; + static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; + static const bool RunningFullReduction = (NumOutputDims==0); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims()) + { + EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Build the bitmap indicating if an input dimension is reduced or not. 
+ for (int i = 0; i < NumInputDims; ++i) { + m_reduced[i] = false; + } + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op.dims()[i] >= 0); + eigen_assert(op.dims()[i] < NumInputDims); + m_reduced[op.dims()[i]] = true; + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + internal::DimInitializer::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims); + + // Precompute output strides. + if (NumOutputDims > 0) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_outputStrides.back() = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + } + + // Precompute input strides. + if (NumInputDims > 0) { + array input_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i-1] * input_dims[i-1]; + } + } else { + input_strides.back() = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedStrides[reduceIndex] = input_strides[i]; + ++reduceIndex; + } else { + m_preservedStrides[outputIndex] = input_strides[i]; + ++outputIndex; + } + } + } + + // Special case for full reductions + if (NumOutputDims == 0) { + m_preservedStrides[0] = internal::array_prod(input_dims); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_::Type data) { + m_impl.evalSubExprsIfNeeded(NULL); + + // Use the FullReducer if possible. 
+ if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction && + internal::FullReducer::HasOptimizedImplementation && + ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || + !RunningOnGPU))) { + bool need_assign = false; + if (!data) { + m_result = static_cast(m_device.allocate(sizeof(CoeffReturnType))); + data = m_result; + need_assign = true; + } + Op reducer(m_reducer); + internal::FullReducer::run(*this, reducer, m_device, data); + return need_assign; + } + else if(RunningOnSycl){ + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } + Op reducer(m_reducer); + internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return (m_result != NULL); + } + + // Attempt to use an optimized reduction. + else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) { + bool reducing_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + reducing_inner_dims &= m_reduced[i]; + } else { + reducing_inner_dims &= m_reduced[NumInputDims - 1 - i]; + } + } + if (internal::InnerReducer::HasOptimizedImplementation && + (reducing_inner_dims || ReducingInnerMostDims)) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) { + data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } + else { + return true; + } + } + Op reducer(m_reducer); + if (internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { + if 
(m_result) { + m_device.deallocate(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } + } + + bool preserving_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + preserving_inner_dims &= m_reduced[NumInputDims - 1 - i]; + } else { + preserving_inner_dims &= m_reduced[i]; + } + } + if (internal::OuterReducer::HasOptimizedImplementation && + preserving_inner_dims) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) { + data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } + else { + return true; + } + } + Op reducer(m_reducer); + if (internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } + } + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + if (m_result) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) { + return *(m_result + index); + } + Op reducer(m_reducer); + if (ReducingInnerMostDims || RunningFullReduction) { + const Index num_values_to_reduce = + (static_cast(Layout) == static_cast(ColMajor)) ? 
m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; + return internal::InnerMostDimReducer::reduce(*this, firstInput(index), + num_values_to_reduce, reducer); + } else { + typename Self::CoeffReturnType accum = reducer.initialize(); + internal::GenericDimReducer::reduce(*this, firstInput(index), reducer, &accum); + return reducer.finalize(accum); + } + } + + // TODO(bsteiner): provide a more efficient implementation. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions()))); + + if (RunningOnGPU && m_result) { + return internal::pload(m_result + index); + } + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; + const Index firstIndex = firstInput(index); + for (Index i = 0; i < PacketSize; ++i) { + Op reducer(m_reducer); + values[i] = internal::InnerMostDimReducer::reduce(*this, firstIndex + i * num_values_to_reduce, + num_values_to_reduce, reducer); + } + } else if (PreservingInnerMostDims) { + const Index firstIndex = firstInput(index); + const int innermost_dim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : NumOutputDims - 1; + // TBD: extend this the the n innermost dimensions that we preserve. 
+ if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) { + Op reducer(m_reducer); + typename Self::PacketReturnType accum = reducer.template initializePacket(); + internal::InnerMostDimPreserver::reduce(*this, firstIndex, reducer, &accum); + return reducer.finalizePacket(accum); + } else { + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + } + } else { + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + // Must be called after evalSubExprsIfNeeded(). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + if (RunningFullReduction && m_result) { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } else { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const double compute_cost = num_values_to_reduce * internal::functor_traits::Cost; + return m_impl.costPerCoeff(vectorized) * num_values_to_reduce + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + } + + EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& impl() const { return m_impl; } + /// added for sycl in order to construct the buffer from the sycl device + const Device& device() const{return m_device;} + /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel + const Dims& xprDims() const {return m_xpr_dims;} + + + private: + template friend struct internal::GenericDimReducer; + template friend struct internal::InnerMostDimReducer; + template friend struct internal::InnerMostDimPreserver; + template friend struct internal::FullReducer; +#ifdef EIGEN_USE_THREADS + template friend struct internal::FullReducerShard; +#endif +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + template 
friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); +#ifdef EIGEN_HAS_CUDA_FP16 + template friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); + template friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); + template friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); +#endif + template friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + + template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); +#endif + + template friend struct internal::InnerReducer; + + // Returns the Index in the input tensor of the first value that needs to be + // used to compute the reduction at output index "index". + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + if (ReducingInnerMostDims) { + if (static_cast(Layout) == static_cast(ColMajor)) { + return index * m_preservedStrides[0]; + } else { + return index * m_preservedStrides[NumPreservedStrides - 1]; + } + } + // TBD: optimize the case where we preserve the innermost dimensions. + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (PreservingInnerMostDims) { + eigen_assert(m_preservedStrides[0] == 1); + startInput += index; + } else { + startInput += index * m_preservedStrides[0]; + } + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + // This is index_i in the output tensor. 
+ const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (PreservingInnerMostDims) { + eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1); + startInput += index; + } else { + startInput += index * m_preservedStrides[NumPreservedStrides - 1]; + } + } + return startInput; + } + + // Bitmap indicating if an input dimension is reduced or not. + array m_reduced; + // Dimensions of the output of the operation. + Dimensions m_dimensions; + // Precomputed strides for the output tensor. + array m_outputStrides; + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. + static const int NumPreservedStrides = max_n_1::size; + array m_preservedStrides; + + // Subset of strides of the input tensor for the reduced dimensions. + // Indexed by reduced dimensions. + array m_reducedStrides; + // Size of the input dimensions that are reduced. + // Indexed by reduced dimensions. + array m_reducedDims; + + // Evaluator for the input expression. + TensorEvaluator m_impl; + + // Operation to apply for computing the reduction. 
+ Op m_reducer; + + // For full reductions +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + static const bool RunningOnGPU = internal::is_same::value; + static const bool RunningOnSycl = false; +#elif defined(EIGEN_USE_SYCL) +static const bool RunningOnSycl = internal::is_same::type, Eigen::SyclDevice>::value; +static const bool RunningOnGPU = false; +#else + static const bool RunningOnGPU = false; + static const bool RunningOnSycl = false; +#endif + typename MakePointer_::Type m_result; + + const Device& m_device; + const Dims& m_xpr_dims; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h new file mode 100644 index 00000000..90f43793 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -0,0 +1,750 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H + +namespace Eigen { +namespace internal { + + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +// Full reducers for GPU, don't vectorize for now + +// Reducer function that enables multiple cuda thread to safely accumulate at the same +// output address. It basically reads the current value of the output variable, and +// attempts to update it with the new value. If in the meantime another cuda thread +// updated the content of the output address it will try again. 
+template +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if __CUDA_ARCH__ >= 300 + if (sizeof(T) == 4) + { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else { + assert(0 && "Wordsize not supported"); + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +// We extend atomicExch to support extra data types +template +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} + +#ifdef EIGEN_HAS_CUDA_FP16 +template