diff --git a/eigenlib/howto.txt b/eigenlib/howto.txt index d88199d5..5a7e5bef 100644 --- a/eigenlib/howto.txt +++ b/eigenlib/howto.txt @@ -8,8 +8,8 @@ Current Eigen Version 3.3.9 (04.12.2020) updated on 15/06/2021 To update the lib: - download Eigen - unzip it somewhere -- delete (in the filesystem) the content of the folder eigenlib/Eigen - copy the folders 'Eigen' there -- execute the two following shell commands in the folder Eigen +- delete (in the filesystem) the content of the folder eigenlib/Eigen - copy the folders 'Eigen' and 'unsupported' there +- execute the two following shell commands in the folder 'Eigen' and 'unsupported' grep -RiIl 'http://mozilla.org/MPL/2.0/.' * | xargs sed -i 's/http:\/\/mozilla.org\/MPL\/2.0\/./the mozilla.org home page/g' grep -RiIl 'http' * | xargs sed -i 's/http/xxxp/g' diff --git a/eigenlib/unsupported/Eigen/AdolcForward b/eigenlib/unsupported/Eigen/AdolcForward new file mode 100644 index 00000000..0390396f --- /dev/null +++ b/eigenlib/unsupported/Eigen/AdolcForward @@ -0,0 +1,156 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_ADLOC_FORWARD +#define EIGEN_ADLOC_FORWARD + +//-------------------------------------------------------------------------------- +// +// This file provides support for adolc's adouble type in forward mode. +// ADOL-C is a C++ automatic differentiation library, +// see xxxps://projects.coin-or.org/ADOL-C for more information. +// +// Note that the maximal number of directions is controlled by +// the preprocessor token NUMBER_DIRECTIONS. The default is 2. +// +//-------------------------------------------------------------------------------- + +#define ADOLC_TAPELESS +#ifndef NUMBER_DIRECTIONS +# define NUMBER_DIRECTIONS 2 +#endif +#include + +// adolc defines some very stupid macros: +#if defined(malloc) +# undef malloc +#endif + +#if defined(calloc) +# undef calloc +#endif + +#if defined(realloc) +# undef realloc +#endif + +#include + +namespace Eigen { + +/** + * \defgroup AdolcForward_Module Adolc forward module + * This module provides support for adolc's adouble type in forward mode. + * ADOL-C is a C++ automatic differentiation library, + * see xxxps://projects.coin-or.org/ADOL-C for more information. + * It mainly consists in: + * - a struct Eigen::NumTraits specialization + * - overloads of internal::* math function for adtl::adouble type. + * + * Note that the maximal number of directions is controlled by + * the preprocessor token NUMBER_DIRECTIONS. The default is 2. + * + * \code + * #include + * \endcode + */ + //@{ + +} // namespace Eigen + +// Eigen's require a few additional functions which must be defined in the same namespace +// than the custom scalar type own namespace +namespace adtl { + +inline const adouble& conj(const adouble& x) { return x; } +inline const adouble& real(const adouble& x) { return x; } +inline adouble imag(const adouble&) { return 0.; } +inline adouble abs(const adouble& x) { return fabs(x); } +inline adouble abs2(const adouble& x) { return x*x; } + +} + +namespace Eigen { + +template<> struct NumTraits + : NumTraits +{ + typedef adtl::adouble Real; + typedef adtl::adouble NonInteger; + typedef adtl::adouble Nested; + enum { + IsComplex = 0, + IsInteger = 0, + IsSigned = 1, + RequireInitialization = 1, + ReadCost = 1, + AddCost = 1, + MulCost = 1 + }; +}; + +template class AdolcForwardJacobian : public Functor +{ + typedef adtl::adouble ActiveScalar; +public: + + AdolcForwardJacobian() : Functor() {} + AdolcForwardJacobian(const Functor& f) : Functor(f) {} + + // forward constructors + template + AdolcForwardJacobian(const T0& a0) : Functor(a0) {} + template + AdolcForwardJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {} + template + AdolcForwardJacobian(const T0& a0, const T1& a1, const T1& a2) : Functor(a0, a1, a2) {} + + typedef typename Functor::InputType InputType; + typedef typename Functor::ValueType ValueType; + typedef typename Functor::JacobianType JacobianType; + + typedef Matrix ActiveInput; + typedef Matrix ActiveValue; + + void operator() (const InputType& x, ValueType* v, JacobianType* _jac) const + { + eigen_assert(v!=0); + if (!_jac) + { + Functor::operator()(x, v); + return; + } + + JacobianType& jac = *_jac; + + ActiveInput ax = x.template cast(); + ActiveValue av(jac.rows()); + + for (int j=0; j +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_ALIGNED_VECTOR3 +#define EIGEN_ALIGNED_VECTOR3 + +#include + +namespace Eigen { + +/** + * \defgroup AlignedVector3_Module Aligned vector3 module + * + * \code + * #include + * \endcode + */ + //@{ + + +/** \class AlignedVector3 + * + * \brief A vectorization friendly 3D vector + * + * This class represents a 3D vector internally using a 4D vector + * such that vectorization can be seamlessly enabled. Of course, + * the same result can be achieved by directly using a 4D vector. + * This class makes this process simpler. + * + */ +// TODO specialize Cwise +template class AlignedVector3; + +namespace internal { +template struct traits > + : traits > +{ +}; +} + +template class AlignedVector3 + : public MatrixBase > +{ + typedef Matrix<_Scalar,4,1> CoeffType; + CoeffType m_coeffs; + public: + + typedef MatrixBase > Base; + EIGEN_DENSE_PUBLIC_INTERFACE(AlignedVector3) + using Base::operator*; + + inline Index rows() const { return 3; } + inline Index cols() const { return 1; } + + Scalar* data() { return m_coeffs.data(); } + const Scalar* data() const { return m_coeffs.data(); } + Index innerStride() const { return 1; } + Index outerStride() const { return 3; } + + inline const Scalar& coeff(Index row, Index col) const + { return m_coeffs.coeff(row, col); } + + inline Scalar& coeffRef(Index row, Index col) + { return m_coeffs.coeffRef(row, col); } + + inline const Scalar& coeff(Index index) const + { return m_coeffs.coeff(index); } + + inline Scalar& coeffRef(Index index) + { return m_coeffs.coeffRef(index);} + + + inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z) + : m_coeffs(x, y, z, Scalar(0)) + {} + + inline AlignedVector3(const AlignedVector3& other) + : Base(), m_coeffs(other.m_coeffs) + {} + + template + struct generic_assign_selector {}; + + template struct generic_assign_selector + { + inline static void run(AlignedVector3& dest, const XprType& src) + { + dest.m_coeffs = src; + } + }; + + template struct generic_assign_selector + { + inline static void run(AlignedVector3& dest, const XprType& src) + { + dest.m_coeffs.template head<3>() = src; + dest.m_coeffs.w() = Scalar(0); + } + }; + + template + inline AlignedVector3(const MatrixBase& other) + { + generic_assign_selector::run(*this,other.derived()); + } + + inline AlignedVector3& operator=(const AlignedVector3& other) + { m_coeffs = other.m_coeffs; return *this; } + + template + inline AlignedVector3& operator=(const MatrixBase& other) + { + generic_assign_selector::run(*this,other.derived()); + return *this; + } + + inline AlignedVector3 operator+(const AlignedVector3& other) const + { return AlignedVector3(m_coeffs + other.m_coeffs); } + + inline AlignedVector3& operator+=(const AlignedVector3& other) + { m_coeffs += other.m_coeffs; return *this; } + + inline AlignedVector3 operator-(const AlignedVector3& other) const + { return AlignedVector3(m_coeffs - other.m_coeffs); } + + inline AlignedVector3 operator-=(const AlignedVector3& other) + { m_coeffs -= other.m_coeffs; return *this; } + + inline AlignedVector3 operator*(const Scalar& s) const + { return AlignedVector3(m_coeffs * s); } + + inline friend AlignedVector3 operator*(const Scalar& s,const AlignedVector3& vec) + { return AlignedVector3(s * vec.m_coeffs); } + + inline AlignedVector3& operator*=(const Scalar& s) + { m_coeffs *= s; return *this; } + + inline AlignedVector3 operator/(const Scalar& s) const + { return AlignedVector3(m_coeffs / s); } + + inline AlignedVector3& operator/=(const Scalar& s) + { m_coeffs /= s; return *this; } + + inline Scalar dot(const AlignedVector3& other) const + { + eigen_assert(m_coeffs.w()==Scalar(0)); + eigen_assert(other.m_coeffs.w()==Scalar(0)); + return m_coeffs.dot(other.m_coeffs); + } + + inline void normalize() + { + m_coeffs /= norm(); + } + + inline AlignedVector3 normalized() const + { + return AlignedVector3(m_coeffs / norm()); + } + + inline Scalar sum() const + { + eigen_assert(m_coeffs.w()==Scalar(0)); + return m_coeffs.sum(); + } + + inline Scalar squaredNorm() const + { + eigen_assert(m_coeffs.w()==Scalar(0)); + return m_coeffs.squaredNorm(); + } + + inline Scalar norm() const + { + using std::sqrt; + return sqrt(squaredNorm()); + } + + inline AlignedVector3 cross(const AlignedVector3& other) const + { + return AlignedVector3(m_coeffs.cross3(other.m_coeffs)); + } + + template + inline bool isApprox(const MatrixBase& other, const RealScalar& eps=NumTraits::dummy_precision()) const + { + return m_coeffs.template head<3>().isApprox(other,eps); + } + + CoeffType& coeffs() { return m_coeffs; } + const CoeffType& coeffs() const { return m_coeffs; } +}; + +namespace internal { + +template +struct eval, Dense> +{ + typedef const AlignedVector3<_Scalar>& type; +}; + +template +struct evaluator > + : evaluator > +{ + typedef AlignedVector3 XprType; + typedef evaluator > Base; + + evaluator(const XprType &m) : Base(m.coeffs()) {} +}; + +} + +//@} + +} + +#endif // EIGEN_ALIGNED_VECTOR3 diff --git a/eigenlib/unsupported/Eigen/ArpackSupport b/eigenlib/unsupported/Eigen/ArpackSupport new file mode 100644 index 00000000..3cc9ff5b --- /dev/null +++ b/eigenlib/unsupported/Eigen/ArpackSupport @@ -0,0 +1,31 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_ARPACKSUPPORT_MODULE_H +#define EIGEN_ARPACKSUPPORT_MODULE_H + +#include + +/** \defgroup ArpackSupport_Module Arpack support module + * + * This module provides a wrapper to Arpack, a library for sparse eigenvalue decomposition. + * + * \code + * #include + * \endcode + */ + +#include + +#include +#include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h" + +#include + +#endif // EIGEN_ARPACKSUPPORT_MODULE_H +/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/eigenlib/unsupported/Eigen/AutoDiff b/eigenlib/unsupported/Eigen/AutoDiff new file mode 100644 index 00000000..84227510 --- /dev/null +++ b/eigenlib/unsupported/Eigen/AutoDiff @@ -0,0 +1,40 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_AUTODIFF_MODULE +#define EIGEN_AUTODIFF_MODULE + +namespace Eigen { + +/** + * \defgroup AutoDiff_Module Auto Diff module + * + * This module features forward automatic differentation via a simple + * templated scalar type wrapper AutoDiffScalar. + * + * Warning : this should NOT be confused with numerical differentiation, which + * is a different method and has its own module in Eigen : \ref NumericalDiff_Module. + * + * \code + * #include + * \endcode + */ +//@{ + +} + +#include "src/AutoDiff/AutoDiffScalar.h" +// #include "src/AutoDiff/AutoDiffVector.h" +#include "src/AutoDiff/AutoDiffJacobian.h" + +namespace Eigen { +//@} +} + +#endif // EIGEN_AUTODIFF_MODULE diff --git a/eigenlib/unsupported/Eigen/BVH b/eigenlib/unsupported/Eigen/BVH new file mode 100644 index 00000000..278fe645 --- /dev/null +++ b/eigenlib/unsupported/Eigen/BVH @@ -0,0 +1,95 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Ilya Baran +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_BVH_MODULE_H +#define EIGEN_BVH_MODULE_H + +#include +#include +#include +#include +#include + +namespace Eigen { + +/** + * \defgroup BVH_Module BVH module + * \brief This module provides generic bounding volume hierarchy algorithms + * and reference tree implementations. + * + * + * \code + * #include + * \endcode + * + * A bounding volume hierarchy (BVH) can accelerate many geometric queries. This module provides a generic implementation + * of the two basic algorithms over a BVH: intersection of a query object against all objects in the hierarchy and minimization + * of a function over the objects in the hierarchy. It also provides intersection and minimization over a cartesian product of + * two BVH's. A BVH accelerates intersection by using the fact that if a query object does not intersect a volume, then it cannot + * intersect any object contained in that volume. Similarly, a BVH accelerates minimization because the minimum of a function + * over a volume is no greater than the minimum of a function over any object contained in it. + * + * Some sample queries that can be written in terms of intersection are: + * - Determine all points where a ray intersects a triangle mesh + * - Given a set of points, determine which are contained in a query sphere + * - Given a set of spheres, determine which contain the query point + * - Given a set of disks, determine if any is completely contained in a query rectangle (represent each 2D disk as a point \f$(x,y,r)\f$ + * in 3D and represent the rectangle as a pyramid based on the original rectangle and shrinking in the \f$r\f$ direction) + * - Given a set of points, count how many pairs are \f$d\pm\epsilon\f$ apart (done by looking at the cartesian product of the set + * of points with itself) + * + * Some sample queries that can be written in terms of function minimization over a set of objects are: + * - Find the intersection between a ray and a triangle mesh closest to the ray origin (function is infinite off the ray) + * - Given a polyline and a query point, determine the closest point on the polyline to the query + * - Find the diameter of a point cloud (done by looking at the cartesian product and using negative distance as the function) + * - Determine how far two meshes are from colliding (this is also a cartesian product query) + * + * This implementation decouples the basic algorithms both from the type of hierarchy (and the types of the bounding volumes) and + * from the particulars of the query. To enable abstraction from the BVH, the BVH is required to implement a generic mechanism + * for traversal. To abstract from the query, the query is responsible for keeping track of results. + * + * To be used in the algorithms, a hierarchy must implement the following traversal mechanism (see KdBVH for a sample implementation): \code + typedef Volume //the type of bounding volume + typedef Object //the type of object in the hierarchy + typedef Index //a reference to a node in the hierarchy--typically an int or a pointer + typedef VolumeIterator //an iterator type over node children--returns Index + typedef ObjectIterator //an iterator over object (leaf) children--returns const Object & + Index getRootIndex() const //returns the index of the hierarchy root + const Volume &getVolume(Index index) const //returns the bounding volume of the node at given index + void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd, + ObjectIterator &outOBegin, ObjectIterator &outOEnd) const + //getChildren takes a node index and makes [outVBegin, outVEnd) range over its node children + //and [outOBegin, outOEnd) range over its object children + \endcode + * + * To use the hierarchy, call BVIntersect or BVMinimize, passing it a BVH (or two, for cartesian product) and a minimizer or intersector. + * For an intersection query on a single BVH, the intersector encapsulates the query and must provide two functions: + * \code + bool intersectVolume(const Volume &volume) //returns true if the query intersects the volume + bool intersectObject(const Object &object) //returns true if the intersection search should terminate immediately + \endcode + * The guarantee that BVIntersect provides is that intersectObject will be called on every object whose bounding volume + * intersects the query (but possibly on other objects too) unless the search is terminated prematurely. It is the + * responsibility of the intersectObject function to keep track of the results in whatever manner is appropriate. + * The cartesian product intersection and the BVMinimize queries are similar--see their individual documentation. + * + * The following is a simple but complete example for how to use the BVH to accelerate the search for a closest red-blue point pair: + * \include BVH_Example.cpp + * Output: \verbinclude BVH_Example.out + */ +} + +//@{ + +#include "src/BVH/BVAlgorithms.h" +#include "src/BVH/KdBVH.h" + +//@} + +#endif // EIGEN_BVH_MODULE_H diff --git a/eigenlib/unsupported/Eigen/CMakeLists.txt b/eigenlib/unsupported/Eigen/CMakeLists.txt new file mode 100644 index 00000000..631a0601 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CMakeLists.txt @@ -0,0 +1,32 @@ +set(Eigen_HEADERS + AdolcForward + AlignedVector3 + ArpackSupport + AutoDiff + BVH + EulerAngles + FFT + IterativeSolvers + KroneckerProduct + LevenbergMarquardt + MatrixFunctions + MoreVectorization + MPRealSupport + NonLinearOptimization + NumericalDiff + OpenGLSupport + Polynomials + Skyline + SparseExtra + SpecialFunctions + Splines + ) + +install(FILES + ${Eigen_HEADERS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel + ) + +install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h") + +add_subdirectory(CXX11) diff --git a/eigenlib/unsupported/Eigen/CXX11/CMakeLists.txt b/eigenlib/unsupported/Eigen/CXX11/CMakeLists.txt new file mode 100644 index 00000000..385ed240 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/CMakeLists.txt @@ -0,0 +1,8 @@ +set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool) + +install(FILES + ${Eigen_CXX11_HEADERS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel + ) + +install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h") diff --git a/eigenlib/unsupported/Eigen/CXX11/Tensor b/eigenlib/unsupported/Eigen/CXX11/Tensor new file mode 100644 index 00000000..5d88bb5e --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/Tensor @@ -0,0 +1,154 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +//#ifndef EIGEN_CXX11_TENSOR_MODULE +//#define EIGEN_CXX11_TENSOR_MODULE + +#include "../../../Eigen/Core" + +#ifdef EIGEN_USE_SYCL +#undef min +#undef max +#undef isnan +#undef isinf +#undef isfinite +#include +#include +#include +#include +#endif + +#include + +#include "../SpecialFunctions" +#include "src/util/CXX11Meta.h" +#include "src/util/MaxSizeVector.h" + +/** \defgroup CXX11_Tensor_Module Tensor Module + * + * This module provides a Tensor class for storing arbitrarily indexed + * objects. + * + * \code + * #include + * \endcode + * + * Much of the documentation can be found \ref eigen_tensors "here". + */ + +#include +#include +#include + +#ifdef _WIN32 +typedef __int16 int16_t; +typedef unsigned __int16 uint16_t; +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#else +#include +#endif + +#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 +#include +#endif + +#ifdef _WIN32 +#include +#elif defined(__APPLE__) +#include +#else +#include +#endif + +#ifdef EIGEN_USE_THREADS +#include "ThreadPool" +#endif + +#ifdef EIGEN_USE_GPU +#include +#include +#if __cplusplus >= 201103L +#include +#include +#endif +#endif + +#include "src/Tensor/TensorMacros.h" +#include "src/Tensor/TensorForwardDeclarations.h" +#include "src/Tensor/TensorMeta.h" +#include "src/Tensor/TensorFunctors.h" +#include "src/Tensor/TensorCostModel.h" +#include "src/Tensor/TensorDeviceDefault.h" +#include "src/Tensor/TensorDeviceThreadPool.h" +#include "src/Tensor/TensorDeviceCuda.h" +#include "src/Tensor/TensorDeviceSycl.h" +#include "src/Tensor/TensorIndexList.h" +#include "src/Tensor/TensorDimensionList.h" +#include "src/Tensor/TensorDimensions.h" +#include "src/Tensor/TensorInitializer.h" +#include "src/Tensor/TensorTraits.h" +#include "src/Tensor/TensorRandom.h" +#include "src/Tensor/TensorUInt128.h" +#include "src/Tensor/TensorIntDiv.h" +#include "src/Tensor/TensorGlobalFunctions.h" + +#include "src/Tensor/TensorBase.h" + +#include "src/Tensor/TensorEvaluator.h" +#include "src/Tensor/TensorExpr.h" +#include "src/Tensor/TensorReduction.h" +#include "src/Tensor/TensorReductionCuda.h" +#include "src/Tensor/TensorArgMax.h" +#include "src/Tensor/TensorConcatenation.h" +#include "src/Tensor/TensorContractionMapper.h" +#include "src/Tensor/TensorContractionBlocking.h" +#include "src/Tensor/TensorContraction.h" +#include "src/Tensor/TensorContractionThreadPool.h" +#include "src/Tensor/TensorContractionCuda.h" +#include "src/Tensor/TensorConversion.h" +#include "src/Tensor/TensorConvolution.h" +#include "src/Tensor/TensorFFT.h" +#include "src/Tensor/TensorPatch.h" +#include "src/Tensor/TensorImagePatch.h" +#include "src/Tensor/TensorVolumePatch.h" +#include "src/Tensor/TensorBroadcasting.h" +#include "src/Tensor/TensorChipping.h" +#include "src/Tensor/TensorInflation.h" +#include "src/Tensor/TensorLayoutSwap.h" +#include "src/Tensor/TensorMorphing.h" +#include "src/Tensor/TensorPadding.h" +#include "src/Tensor/TensorReverse.h" +#include "src/Tensor/TensorShuffling.h" +#include "src/Tensor/TensorStriding.h" +#include "src/Tensor/TensorCustomOp.h" +#include "src/Tensor/TensorEvalTo.h" +#include "src/Tensor/TensorForcedEval.h" +#include "src/Tensor/TensorGenerator.h" +#include "src/Tensor/TensorAssign.h" +#include "src/Tensor/TensorScan.h" + +#include "src/Tensor/TensorSycl.h" +#include "src/Tensor/TensorExecutor.h" +#include "src/Tensor/TensorDevice.h" + +#include "src/Tensor/TensorStorage.h" +#include "src/Tensor/Tensor.h" +#include "src/Tensor/TensorFixedSize.h" +#include "src/Tensor/TensorMap.h" +#include "src/Tensor/TensorRef.h" + +#include "src/Tensor/TensorIO.h" + +#include + +//#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/eigenlib/unsupported/Eigen/CXX11/TensorSymmetry b/eigenlib/unsupported/Eigen/CXX11/TensorSymmetry new file mode 100644 index 00000000..d39167cf --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/TensorSymmetry @@ -0,0 +1,42 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE +#define EIGEN_CXX11_TENSORSYMMETRY_MODULE + +#include + +#include + +#include "src/util/CXX11Meta.h" + +/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module + * + * This module provides a classes that allow for the definition of + * symmetries w.r.t. tensor indices. + * + * Including this module will implicitly include the Tensor module. + * + * \code + * #include + * \endcode + */ + +#include "src/TensorSymmetry/util/TemplateGroupTheory.h" +#include "src/TensorSymmetry/Symmetry.h" +#include "src/TensorSymmetry/StaticSymmetry.h" +#include "src/TensorSymmetry/DynamicSymmetry.h" + +#include + +#endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/eigenlib/unsupported/Eigen/CXX11/ThreadPool b/eigenlib/unsupported/Eigen/CXX11/ThreadPool new file mode 100644 index 00000000..1455b19d --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/ThreadPool @@ -0,0 +1,65 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_THREADPOOL_MODULE +#define EIGEN_CXX11_THREADPOOL_MODULE + +#include "../../../Eigen/Core" + +#include + +/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module + * + * This module provides 2 threadpool implementations + * - a simple reference implementation + * - a faster non blocking implementation + * + * This module requires C++11. + * + * \code + * #include + * \endcode + */ + + +// The code depends on CXX11, so only include the module if the +// compiler supports it. +#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "src/util/CXX11Meta.h" +#include "src/util/MaxSizeVector.h" + +#include "src/ThreadPool/ThreadLocal.h" +#include "src/ThreadPool/ThreadYield.h" +#include "src/ThreadPool/EventCount.h" +#include "src/ThreadPool/RunQueue.h" +#include "src/ThreadPool/ThreadPoolInterface.h" +#include "src/ThreadPool/ThreadEnvironment.h" +#include "src/ThreadPool/SimpleThreadPool.h" +#include "src/ThreadPool/NonBlockingThreadPool.h" + +#endif + +#include + +#endif // EIGEN_CXX11_THREADPOOL_MODULE + diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/README.md b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/README.md new file mode 100644 index 00000000..da70fa21 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -0,0 +1,1760 @@ +# Eigen Tensors {#eigen_tensors} + +Tensors are multidimensional arrays of elements. Elements are typically scalars, +but more complex types such as strings are also supported. + +[TOC] + +## Tensor Classes + +You can manipulate a tensor with one of the following classes. They all are in +the namespace `::Eigen.` + + +### Class Tensor + +This is the class to use to create a tensor and allocate memory for it. The +class is templatized with the tensor datatype, such as float or int, and the +tensor rank. The rank is the number of dimensions, for example rank 2 is a +matrix. + +Tensors of this class are resizable. For example, if you assign a tensor of a +different size to a Tensor, that tensor is resized to match its new value. + +#### Constructor `Tensor(size0, size1, ...)` + +Constructor for a Tensor. The constructor must be passed `rank` integers +indicating the sizes of the instance along each of the the `rank` +dimensions. + + // Create a tensor of rank 3 of sizes 2, 3, 4. This tensor owns + // memory to hold 24 floating point values (24 = 2 x 3 x 4). + Tensor t_3d(2, 3, 4); + + // Resize t_3d by assigning a tensor of different sizes, but same rank. + t_3d = Tensor(3, 4, 3); + +#### Constructor `Tensor(size_array)` + +Constructor where the sizes for the constructor are specified as an array of +values instead of an explicitly list of parameters. The array type to use is +`Eigen::array`. The array can be constructed automatically +from an initializer list. + + // Create a tensor of strings of rank 2 with sizes 5, 7. + Tensor t_2d({5, 7}); + + +### Class `TensorFixedSize>` + +Class to use for tensors of fixed size, where the size is known at compile +time. Fixed sized tensors can provide very fast computations because all their +dimensions are known by the compiler. FixedSize tensors are not resizable. + +If the total number of elements in a fixed size tensor is small enough the +tensor data is held onto the stack and does not cause heap allocation and free. + + // Create a 4 x 3 tensor of floats. + TensorFixedSize> t_4x3; + +### Class `TensorMap>` + +This is the class to use to create a tensor on top of memory allocated and +owned by another part of your code. It allows to view any piece of allocated +memory as a Tensor. Instances of this class do not own the memory where the +data are stored. + +A TensorMap is not resizable because it does not own the memory where its data +are stored. + +#### Constructor `TensorMap>(data, size0, size1, ...)` + +Constructor for a Tensor. The constructor must be passed a pointer to the +storage for the data, and "rank" size attributes. The storage has to be +large enough to hold all the data. + + // Map a tensor of ints on top of stack-allocated storage. + int storage[128]; // 2 x 4 x 2 x 8 = 128 + TensorMap> t_4d(storage, 2, 4, 2, 8); + + // The same storage can be viewed as a different tensor. + // You can also pass the sizes as an array. + TensorMap> t_2d(storage, 16, 8); + + // You can also map fixed-size tensors. Here we get a 1d view of + // the 2d fixed-size tensor. + TensorFixedSize> t_4x3; + TensorMap> t_12(t_4x3.data(), 12); + + +#### Class `TensorRef` + +See Assigning to a TensorRef below. + +## Accessing Tensor Elements + +#### ` tensor(index0, index1...)` + +Return the element at position `(index0, index1...)` in tensor +`tensor`. You must pass as many parameters as the rank of `tensor`. +The expression can be used as an l-value to set the value of the element at the +specified position. The value returned is of the datatype of the tensor. + + // Set the value of the element at position (0, 1, 0); + Tensor t_3d(2, 3, 4); + t_3d(0, 1, 0) = 12.0f; + + // Initialize all elements to random values. + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 4; ++k) { + t_3d(i, j, k) = ...some random value...; + } + } + } + + // Print elements of a tensor. + for (int i = 0; i < 2; ++i) { + LOG(INFO) << t_3d(i, 0, 0); + } + + +## TensorLayout + +The tensor library supports 2 layouts: `ColMajor` (the default) and +`RowMajor`. Only the default column major layout is currently fully +supported, and it is therefore not recommended to attempt to use the row major +layout at the moment. + +The layout of a tensor is optionally specified as part of its type. If not +specified explicitly column major is assumed. + + Tensor col_major; // equivalent to Tensor + TensorMap > row_major(data, ...); + +All the arguments to an expression must use the same layout. Attempting to mix +different layouts will result in a compilation error. + +It is possible to change the layout of a tensor or an expression using the +`swap_layout()` method. Note that this will also reverse the order of the +dimensions. + + Tensor col_major(2, 4); + Tensor row_major(2, 4); + + Tensor col_major_result = col_major; // ok, layouts match + Tensor col_major_result = row_major; // will not compile + + // Simple layout swap + col_major_result = row_major.swap_layout(); + eigen_assert(col_major_result.dimension(0) == 4); + eigen_assert(col_major_result.dimension(1) == 2); + + // Swap the layout and preserve the order of the dimensions + array shuffle(1, 0); + col_major_result = row_major.swap_layout().shuffle(shuffle); + eigen_assert(col_major_result.dimension(0) == 2); + eigen_assert(col_major_result.dimension(1) == 4); + + +## Tensor Operations + +The Eigen Tensor library provides a vast library of operations on Tensors: +numerical operations such as addition and multiplication, geometry operations +such as slicing and shuffling, etc. These operations are available as methods +of the Tensor classes, and in some cases as operator overloads. For example +the following code computes the elementwise addition of two tensors: + + Tensor t1(2, 3, 4); + ...set some values in t1... + Tensor t2(2, 3, 4); + ...set some values in t2... + // Set t3 to the element wise sum of t1 and t2 + Tensor t3 = t1 + t2; + +While the code above looks easy enough, it is important to understand that the +expression `t1 + t2` is not actually adding the values of the tensors. The +expression instead constructs a "tensor operator" object of the class +TensorCwiseBinaryOp, which has references to the tensors +`t1` and `t2`. This is a small C++ object that knows how to add +`t1` and `t2`. It is only when the value of the expression is assigned +to the tensor `t3` that the addition is actually performed. Technically, +this happens through the overloading of `operator=()` in the Tensor class. + +This mechanism for computing tensor expressions allows for lazy evaluation and +optimizations which are what make the tensor library very fast. + +Of course, the tensor operators do nest, and the expression `t1 + t2 * 0.3f` +is actually represented with the (approximate) tree of operators: + + TensorCwiseBinaryOp(t1, TensorCwiseUnaryOp(t2, 0.3f)) + + +### Tensor Operations and C++ "auto" + +Because Tensor operations create tensor operators, the C++ `auto` keyword +does not have its intuitive meaning. Consider these 2 lines of code: + + Tensor t3 = t1 + t2; + auto t4 = t1 + t2; + +In the first line we allocate the tensor `t3` and it will contain the +result of the addition of `t1` and `t2`. In the second line, `t4` +is actually the tree of tensor operators that will compute the addition of +`t1` and `t2`. In fact, `t4` is *not* a tensor and you cannot get +the values of its elements: + + Tensor t3 = t1 + t2; + cout << t3(0, 0, 0); // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0) + + auto t4 = t1 + t2; + cout << t4(0, 0, 0); // Compilation error! + +When you use `auto` you do not get a Tensor as a result but instead a +non-evaluated expression. So only use `auto` to delay evaluation. + +Unfortunately, there is no single underlying concrete type for holding +non-evaluated expressions, hence you have to use auto in the case when you do +want to hold non-evaluated expressions. + +When you need the results of set of tensor computations you have to assign the +result to a Tensor that will be capable of holding onto them. This can be +either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing +piece of memory. All the following will work: + + auto t4 = t1 + t2; + + Tensor result = t4; // Could also be: result(t4); + cout << result(0, 0, 0); + + TensorMap result(, , ...) = t4; + cout << result(0, 0, 0); + + TensorFixedSize> result = t4; + cout << result(0, 0, 0); + +Until you need the results, you can keep the operation around, and even reuse +it for additional operations. As long as you keep the expression as an +operation, no computation is performed. + + // One way to compute exp((t1 + t2) * 0.2f); + auto t3 = t1 + t2; + auto t4 = t3 * 0.2f; + auto t5 = t4.exp(); + Tensor result = t5; + + // Another way, exactly as efficient as the previous one: + Tensor result = ((t1 + t2) * 0.2f).exp(); + +### Controlling When Expression are Evaluated + +There are several ways to control when expressions are evaluated: + +* Assignment to a Tensor, TensorFixedSize, or TensorMap. +* Use of the eval() method. +* Assignment to a TensorRef. + +#### Assigning to a Tensor, TensorFixedSize, or TensorMap. + +The most common way to evaluate an expression is to assign it to a Tensor. In +the example below, the `auto` declarations make the intermediate values +"Operations", not Tensors, and do not cause the expressions to be evaluated. +The assignment to the Tensor `result` causes the evaluation of all the +operations. + + auto t3 = t1 + t2; // t3 is an Operation. + auto t4 = t3 * 0.2f; // t4 is an Operation. + auto t5 = t4.exp(); // t5 is an Operation. + Tensor result = t5; // The operations are evaluated. + +If you know the ranks and sizes of the Operation value you can assign the +Operation to a TensorFixedSize instead of a Tensor, which is a bit more +efficient. + + // We know that the result is a 4x4x2 tensor! + TensorFixedSize> result = t5; + +Simiarly, assigning an expression to a TensorMap causes its evaluation. Like +tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to +have the rank and sizes of the expression that are assigned to them. + +#### Calling `eval()`. + +When you compute large composite expressions, you sometimes want to tell Eigen +that an intermediate value in the expression tree is worth evaluating ahead of +time. This is done by inserting a call to the `eval()` method of the +expression Operation. + + // The previous example could have been written: + Tensor result = ((t1 + t2) * 0.2f).exp(); + + // If you want to compute (t1 + t2) once ahead of time you can write: + Tensor result = ((t1 + t2).eval() * 0.2f).exp(); + +Semantically, calling `eval()` is equivalent to materializing the value of +the expression in a temporary Tensor of the right size. The code above in +effect does: + + // .eval() knows the size! + TensorFixedSize> tmp = t1 + t2; + Tensor result = (tmp * 0.2f).exp(); + +Note that the return value of `eval()` is itself an Operation, so the +following code does not do what you may think: + + // Here t3 is an evaluation Operation. t3 has not been evaluated yet. + auto t3 = (t1 + t2).eval(); + + // You can use t3 in another expression. Still no evaluation. + auto t4 = (t3 * 0.2f).exp(); + + // The value is evaluated when you assign the Operation to a Tensor, using + // an intermediate tensor to represent t3.x + Tensor result = t4; + +While in the examples above calling `eval()` does not make a difference in +performance, in other cases it can make a huge difference. In the expression +below the `broadcast()` expression causes the `X.maximum()` expression +to be evaluated many times: + + Tensor<...> X ...; + Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast)) + * beta).exp(); + +Inserting a call to `eval()` between the `maximum()` and +`reshape()` calls guarantees that maximum() is only computed once and +greatly speeds-up execution: + + Tensor<...> Y = + ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) + * beta).exp(); + +In the other example below, the tensor `Y` is both used in the expression +and its assignment. This is an aliasing problem and if the evaluation is not +done in the right order Y will be updated incrementally during the evaluation +resulting in bogus results: + + Tensor<...> Y ...; + Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast)); + +Inserting a call to `eval()` between the `sum()` and `reshape()` +expressions ensures that the sum is computed before any updates to `Y` are +done. + + Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + +Note that an eval around the full right hand side expression is not needed +because the generated has to compute the i-th value of the right hand side +before assigning it to the left hand side. + +However, if you were assigning the expression value to a shuffle of `Y` +then you would need to force an eval for correctness by adding an `eval()` +call for the right hand side: + + Y.shuffle(...) = + (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval(); + + +#### Assigning to a `TensorRef`. + +If you need to access only a few elements from the value of an expression you +can avoid materializing the value in a full tensor by using a TensorRef. + +A TensorRef is a small wrapper class for any Eigen Operation. It provides +overloads for the `()` operator that let you access individual values in +the expression. TensorRef is convenient, because the Operation themselves do +not provide a way to access individual elements. + + // Create a TensorRef for the expression. The expression is not + // evaluated yet. + TensorRef > ref = ((t1 + t2) * 0.2f).exp(); + + // Use "ref" to access individual elements. The expression is evaluated + // on the fly. + float at_0 = ref(0, 0, 0); + cout << ref(0, 1, 0); + +Only use TensorRef when you need a subset of the values of the expression. +TensorRef only computes the values you access. However note that if you are +going to access all the values it will be much faster to materialize the +results in a Tensor first. + +In some cases, if the full Tensor result would be very large, you may save +memory by accessing it as a TensorRef. But not always. So don't count on it. + + +### Controlling How Expressions Are Evaluated + +The tensor library provides several implementations of the various operations +such as contractions and convolutions. The implementations are optimized for +different environments: single threaded on CPU, multi threaded on CPU, or on a +GPU using cuda. Additional implementations may be added later. + +You can choose which implementation to use with the `device()` call. If +you do not choose an implementation explicitly the default implementation that +uses a single thread on the CPU is used. + +The default implementation has been optimized for recent Intel CPUs, taking +advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the +library on ARM CPUs. Note that you need to pass compiler-dependent flags +to enable the use of SSE, AVX, and other instructions. + +For example, the following code adds two tensors using the default +single-threaded CPU implementation: + + Tensor a(30, 40); + Tensor b(30, 40); + Tensor c = a + b; + +To choose a different implementation you have to insert a `device()` call +before the assignment of the result. For technical C++ reasons this requires +that the Tensor for the result be declared on its own. This means that you +have to know the size of the result. + + Eigen::Tensor c(30, 40); + c.device(...) = a + b; + +The call to `device()` must be the last call on the left of the operator=. + +You must pass to the `device()` call an Eigen device object. There are +presently three devices you can use: DefaultDevice, ThreadPoolDevice and +GpuDevice. + + +#### Evaluating With the DefaultDevice + +This is exactly the same as not inserting a `device()` call. + + DefaultDevice my_device; + c.device(my_device) = a + b; + +#### Evaluating with a Thread Pool + + // Create the Eigen ThreadPoolDevice. + Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */); + + // Now just use the device when evaluating expressions. + Eigen::Tensor c(30, 50); + c.device(my_device) = a.contract(b, dot_product_dims); + + +#### Evaluating On GPU + +This is presently a bit more complicated than just using a thread pool device. +You need to create a GPU device but you also need to explicitly allocate the +memory for tensors with cuda. + + +## API Reference + +### Datatypes + +In the documentation of the tensor methods and Operation we mention datatypes +that are tensor-type specific: + +#### `::``Dimensions` + +Acts like an array of ints. Has an `int size` attribute, and can be +indexed like an array to access individual values. Used to represent the +dimensions of a tensor. See `dimensions()`. + +#### `::``Index` + +Acts like an `int`. Used for indexing tensors along their dimensions. See +`operator()`, `dimension()`, and `size()`. + +#### `::``Scalar` + +Represents the datatype of individual tensor elements. For example, for a +`Tensor`, `Scalar` is the type `float`. See +`setConstant()`. + +#### `` + +We use this pseudo type to indicate that a tensor Operation is returned by a +method. We indicate in the text the type and dimensions of the tensor that the +Operation returns after evaluation. + +The Operation will have to be evaluated, for example by assigning it to a +tensor, before you can access the values of the resulting tensor. You can also +access the values through a TensorRef. + + +## Built-in Tensor Methods + +These are usual C++ methods that act on tensors immediately. They are not +Operations which provide delayed evaluation of their results. Unless specified +otherwise, all the methods listed below are available on all tensor classes: +Tensor, TensorFixedSize, and TensorMap. + +## Metadata + +### `int NumDimensions` + +Constant value indicating the number of dimensions of a Tensor. This is also +known as the tensor "rank". + + Eigen::Tensor a(3, 4); + cout << "Dims " << a.NumDimensions; + => Dims 2 + +### `Dimensions dimensions()` + +Returns an array-like object representing the dimensions of the tensor. +The actual type of the `dimensions()` result is `::``Dimensions`. + + Eigen::Tensor a(3, 4); + const Eigen::Tensor::Dimensions& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +If you use a C++11 compiler, you can use `auto` to simplify the code: + + const auto& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +### `Index dimension(Index n)` + +Returns the n-th dimension of the tensor. The actual type of the +`dimension()` result is `::``Index`, but you can +always use it like an int. + + Eigen::Tensor a(3, 4); + int dim1 = a.dimension(1); + cout << "Dim 1: " << dim1; + => Dim 1: 4 + +### `Index size()` + +Returns the total number of elements in the tensor. This is the product of all +the tensor dimensions. The actual type of the `size()` result is +`::``Index`, but you can always use it like an int. + + Eigen::Tensor a(3, 4); + cout << "Size: " << a.size(); + => Size: 12 + + +### Getting Dimensions From An Operation + +A few operations provide `dimensions()` directly, +e.g. `TensorReslicingOp`. Most operations defer calculating dimensions +until the operation is being evaluated. If you need access to the dimensions +of a deferred operation, you can wrap it in a TensorRef (see Assigning to a +TensorRef above), which provides `dimensions()` and `dimension()` as +above. + +TensorRef can also wrap the plain Tensor types, so this is a useful idiom in +templated contexts where the underlying object could be either a raw Tensor +or some deferred operation (e.g. a slice of a Tensor). In this case, the +template code can wrap the object in a TensorRef and reason about its +dimensionality while remaining agnostic to the underlying type. + + +## Constructors + +### Tensor + +Creates a tensor of the specified size. The number of arguments must be equal +to the rank of the tensor. The content of the tensor is not initialized. + + Eigen::Tensor a(3, 4); + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + +### TensorFixedSize + +Creates a tensor of the specified size. The number of arguments in the Sizes<> +template parameter determines the rank of the tensor. The content of the tensor +is not initialized. + + Eigen::TensorFixedSize> a; + cout << "Rank: " << a.rank() << endl; + => Rank: 2 + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + +### TensorMap + +Creates a tensor mapping an existing array of data. The data must not be freed +until the TensorMap is discarded, and the size of the data must be large enough +to accommodate the coefficients of the tensor. + + float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + Eigen::TensorMap> a(data, 3, 4); + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + cout << "a(1, 2): " << a(1, 2) << endl; + => a(1, 2): 7 + + +## Contents Initialization + +When a new Tensor or a new TensorFixedSize are created, memory is allocated to +hold all the tensor elements, but the memory is not initialized. Similarly, +when a new TensorMap is created on top of non-initialized memory the memory its +contents are not initialized. + +You can use one of the methods below to initialize the tensor memory. These +have an immediate effect on the tensor and return the tensor itself as a +result. These are not tensor Operations which delay evaluation. + +### ` setConstant(const Scalar& val)` + +Sets all elements of the tensor to the constant value `val`. `Scalar` +is the type of data stored in the tensor. You can pass any value that is +convertible to that type. + +Returns the tensor itself in case you want to chain another call. + + a.setConstant(12.3f); + cout << "Constant: " << endl << a << endl << endl; + => + Constant: + 12.3 12.3 12.3 12.3 + 12.3 12.3 12.3 12.3 + 12.3 12.3 12.3 12.3 + +Note that `setConstant()` can be used on any tensor where the element type +has a copy constructor and an `operator=()`: + + Eigen::Tensor a(2, 3); + a.setConstant("yolo"); + cout << "String tensor: " << endl << a << endl << endl; + => + String tensor: + yolo yolo yolo + yolo yolo yolo + + +### ` setZero()` + +Fills the tensor with zeros. Equivalent to `setConstant(Scalar(0))`. +Returns the tensor itself in case you want to chain another call. + + a.setZero(); + cout << "Zeros: " << endl << a << endl << endl; + => + Zeros: + 0 0 0 0 + 0 0 0 0 + 0 0 0 0 + + +### ` setValues({..initializer_list})` + +Fills the tensor with explicit values specified in a std::initializer_list. +The type of the initializer list depends on the type and rank of the tensor. + +If the tensor has rank N, the initializer list must be nested N times. The +most deeply nested lists must contains P scalars of the Tensor type where P is +the size of the last dimension of the Tensor. + +For example, for a `TensorFixedSize` the initializer list must +contains 2 lists of 3 floats each. + +`setValues()` returns the tensor itself in case you want to chain another +call. + + Eigen::Tensor a(2, 3); + a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}}); + cout << "a" << endl << a << endl << endl; + => + a + 0 1 2 + 3 4 5 + +If a list is too short, the corresponding elements of the tensor will not be +changed. This is valid at each level of nesting. For example the following +code only sets the values of the first row of the tensor. + + Eigen::Tensor a(2, 3); + a.setConstant(1000); + a.setValues({{10, 20, 30}}); + cout << "a" << endl << a << endl << endl; + => + a + 10 20 30 + 1000 1000 1000 + +### ` setRandom()` + +Fills the tensor with random values. Returns the tensor itself in case you +want to chain another call. + + a.setRandom(); + cout << "Random: " << endl << a << endl << endl; + => + Random: + 0.680375 0.59688 -0.329554 0.10794 + -0.211234 0.823295 0.536459 -0.0452059 + 0.566198 -0.604897 -0.444451 0.257742 + +You can customize `setRandom()` by providing your own random number +generator as a template argument: + + a.setRandom(); + +Here, `MyRandomGenerator` must be a struct with the following member +functions, where Scalar and Index are the same as `::``Scalar` +and `::``Index`. + +See `struct UniformRandomGenerator` in TensorFunctors.h for an example. + + // Custom number generator for use with setRandom(). + struct MyRandomGenerator { + // Default and copy constructors. Both are needed + MyRandomGenerator() { } + MyRandomGenerator(const MyRandomGenerator& ) { } + + // Return a random value to be used. "element_location" is the + // location of the entry to set in the tensor, it can typically + // be ignored. + Scalar operator()(Eigen::DenseIndex element_location, + Eigen::DenseIndex /*unused*/ = 0) const { + return ; + } + + // Same as above but generates several numbers at a time. + typename internal::packet_traits::type packetOp( + Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { + return ; + } + }; + +You can also use one of the 2 random number generators that are part of the +tensor library: +* UniformRandomGenerator +* NormalRandomGenerator + + +## Data Access + +The Tensor, TensorFixedSize, and TensorRef classes provide the following +accessors to access the tensor coefficients: + + const Scalar& operator()(const array& indices) + const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + Scalar& operator()(const array& indices) + Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + +The number of indices must be equal to the rank of the tensor. Moreover, these +accessors are not available on tensor expressions. In order to access the +values of a tensor expression, the expression must either be evaluated or +wrapped in a TensorRef. + + +### `Scalar* data()` and `const Scalar* data() const` + +Returns a pointer to the storage for the tensor. The pointer is const if the +tensor was const. This allows direct access to the data. The layout of the +data depends on the tensor layout: RowMajor or ColMajor. + +This access is usually only needed for special cases, for example when mixing +Eigen Tensor code with other libraries. + +Scalar is the type of data stored in the tensor. + + Eigen::Tensor a(3, 4); + float* a_data = a.data(); + a_data[0] = 123.45f; + cout << "a(0, 0): " << a(0, 0); + => a(0, 0): 123.45 + + +## Tensor Operations + +All the methods documented below return non evaluated tensor `Operations`. +These can be chained: you can apply another Tensor Operation to the value +returned by the method. + +The chain of Operation is evaluated lazily, typically when it is assigned to a +tensor. See "Controlling when Expression are Evaluated" for more details about +their evaluation. + +### ` constant(const Scalar& val)` + +Returns a tensor of the same type and dimensions as the original tensor but +where all elements have the value `val`. + +This is useful, for example, when you want to add or subtract a constant from a +tensor, or multiply every element of a tensor by a scalar. + + Eigen::Tensor a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor b = a + a.constant(2.0f); + Eigen::Tensor c = b * b.constant(0.2f); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + cout << "c" << endl << c << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + 3 3 3 + 3 3 3 + + c + 0.6 0.6 0.6 + 0.6 0.6 0.6 + +### ` random()` + +Returns a tensor of the same type and dimensions as the current tensor +but where all elements have random values. + +This is for example useful to add random values to an existing tensor. +The generation of random values can be customized in the same manner +as for `setRandom()`. + + Eigen::Tensor a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor b = a + a.random(); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + 1.68038 1.5662 1.82329 + 0.788766 1.59688 0.395103 + + +## Unary Element Wise Operations + +All these operations take a single input tensor as argument and return a tensor +of the same type and dimensions as the tensor to which they are applied. The +requested operations are applied to each element independently. + +### ` operator-()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the opposite values of the original tensor. + + Eigen::Tensor a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor b = -a; + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + -1 -1 -1 + -1 -1 -1 + +### ` sqrt()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the square roots of the original tensor. + +### ` rsqrt()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the inverse square roots of the original tensor. + +### ` square()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the squares of the original tensor values. + +### ` inverse()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the inverse of the original tensor values. + +### ` exp()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the exponential of the original tensor. + +### ` log()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the natural logarithms of the original tensor. + +### ` abs()` + +Returns a tensor of the same type and dimensions as the original tensor +containing the absolute values of the original tensor. + +### ` pow(Scalar exponent)` + +Returns a tensor of the same type and dimensions as the original tensor +containing the coefficients of the original tensor to the power of the +exponent. + +The type of the exponent, Scalar, is always the same as the type of the +tensor coefficients. For example, only integer exponents can be used in +conjuntion with tensors of integer values. + +You can use cast() to lift this restriction. For example this computes +cubic roots of an int Tensor: + + Eigen::Tensor a(2, 3); + a.setValues({{0, 1, 8}, {27, 64, 125}}); + Eigen::Tensor b = a.cast().pow(1.0 / 3.0); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 0 1 8 + 27 64 125 + + b + 0 1 2 + 3 4 5 + +### ` operator * (Scalar scale)` + +Multiplies all the coefficients of the input tensor by the provided scale. + +### ` cwiseMax(Scalar threshold)` +TODO + +### ` cwiseMin(Scalar threshold)` +TODO + +### ` unaryExpr(const CustomUnaryOp& func)` +TODO + + +## Binary Element Wise Operations + +These operations take two input tensors as arguments. The 2 input tensors should +be of the same type and dimensions. The result is a tensor of the same +dimensions as the tensors to which they are applied, and unless otherwise +specified it is also of the same type. The requested operations are applied to +each pair of elements independently. + +### ` operator+(const OtherDerived& other)` + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise sums of the inputs. + +### ` operator-(const OtherDerived& other)` + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise differences of the inputs. + +### ` operator*(const OtherDerived& other)` + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise products of the inputs. + +### ` operator/(const OtherDerived& other)` + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise quotients of the inputs. + +This operator is not supported for integer types. + +### ` cwiseMax(const OtherDerived& other)` + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise maximums of the inputs. + +### ` cwiseMin(const OtherDerived& other)` + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise mimimums of the inputs. + +### ` Logical operators` + +The following logical operators are supported as well: + +* operator&&(const OtherDerived& other) +* operator||(const OtherDerived& other) +* operator<(const OtherDerived& other) +* operator<=(const OtherDerived& other) +* operator>(const OtherDerived& other) +* operator>=(const OtherDerived& other) +* operator==(const OtherDerived& other) +* operator!=(const OtherDerived& other) + +They all return a tensor of boolean values. + + +## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) + +Selection is a coefficient-wise ternary operator that is the tensor equivalent +to the if-then-else operation. + + Tensor if = ...; + Tensor then = ...; + Tensor else = ...; + Tensor result = if.select(then, else); + +The 3 arguments must be of the same dimensions, which will also be the dimension +of the result. The 'if' tensor must be of type boolean, the 'then' and the +'else' tensor must be of the same type, which will also be the type of the +result. + +Each coefficient in the result is equal to the corresponding coefficient in the +'then' tensor if the corresponding value in the 'if' tensor is true. If not, the +resulting coefficient will come from the 'else' tensor. + + +## Contraction + +Tensor *contractions* are a generalization of the matrix product to the +multidimensional case. + + // Create 2 matrices using tensors of rank 2 + Eigen::Tensor a(2, 3); + a.setValues({{1, 2, 3}, {6, 5, 4}}); + Eigen::Tensor b(3, 2); + b.setValues({{1, 2}, {4, 5}, {5, 6}}); + + // Compute the traditional matrix product + Eigen::array, 1> product_dims = { Eigen::IndexPair(1, 0) }; + Eigen::Tensor AB = a.contract(b, product_dims); + + // Compute the product of the transpose of the matrices + Eigen::array, 1> transposed_product_dims = { Eigen::IndexPair(0, 1) }; + Eigen::Tensor AtBt = a.contract(b, transposed_product_dims); + + // Contraction to scalar value using a double contraction. + // First coordinate of both tensors are contracted as well as both second coordinates, i.e., this computes the sum of the squares of the elements. + Eigen::array, 2> double_contraction_product_dims = { Eigen::IndexPair(0, 0), Eigen::IndexPair(1, 1) }; + Eigen::Tensor AdoubleContractedA = a.contract(a, double_contraction_product_dims); + + // Extracting the scalar value of the tensor contraction for further usage + int value = AdoubleContractedA(0); + +## Reduction Operations + +A *Reduction* operation returns a tensor with fewer dimensions than the +original tensor. The values in the returned tensor are computed by applying a +*reduction operator* to slices of values from the original tensor. You specify +the dimensions along which the slices are made. + +The Eigen Tensor library provides a set of predefined reduction operators such +as `maximum()` and `sum()` and lets you define additional operators by +implementing a few methods from a reductor template. + +### Reduction Dimensions + +All reduction operations take a single parameter of type +`::``Dimensions` which can always be specified as an array of +ints. These are called the "reduction dimensions." The values are the indices +of the dimensions of the input tensor over which the reduction is done. The +parameter can have at most as many element as the rank of the input tensor; +each element must be less than the tensor rank, as it indicates one of the +dimensions to reduce. + +Each dimension of the input tensor should occur at most once in the reduction +dimensions as the implementation does not remove duplicates. + +The order of the values in the reduction dimensions does not affect the +results, but the code may execute faster if you list the dimensions in +increasing order. + +Example: Reduction along one dimension. + + // Create a tensor of 2 dimensions + Eigen::Tensor a(2, 3); + a.setValues({{1, 2, 3}, {6, 5, 4}}); + // Reduce it along the second dimension (1)... + Eigen::array dims({1 /* dimension to reduce */}); + // ...using the "maximum" operator. + // The result is a tensor with one dimension. The size of + // that dimension is the same as the first (non-reduced) dimension of a. + Eigen::Tensor b = a.maximum(dims); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 2 3 + 6 5 4 + + b + 3 + 6 + +Example: Reduction along two dimensions. + + Eigen::Tensor a(2, 3, 4); + a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, + {7.0f, 6.0f, 5.0f, 4.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}, + {{12.0f, 13.0f, 14.0f, 15.0f}, + {19.0f, 18.0f, 17.0f, 16.0f}, + {20.0f, 21.0f, 22.0f, 23.0f}}}); + // The tensor a has 3 dimensions. We reduce along the + // first 2, resulting in a tensor with a single dimension + // of size 4 (the last dimension of a.) + // Note that we pass the array of reduction dimensions + // directly to the maximum() call. + Eigen::Tensor b = + a.maximum(Eigen::array({0, 1})); + cout << "b" << endl << b << endl << endl; + => + b + 20 + 21 + 22 + 23 + +#### Reduction along all dimensions + +As a special case, if you pass no parameter to a reduction operation the +original tensor is reduced along *all* its dimensions. The result is a +scalar, represented as a zero-dimension tensor. + + Eigen::Tensor a(2, 3, 4); + a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, + {7.0f, 6.0f, 5.0f, 4.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}, + {{12.0f, 13.0f, 14.0f, 15.0f}, + {19.0f, 18.0f, 17.0f, 16.0f}, + {20.0f, 21.0f, 22.0f, 23.0f}}}); + // Reduce along all dimensions using the sum() operator. + Eigen::Tensor b = a.sum(); + cout << "b" << endl << b << endl << endl; + => + b + 276 + + +### ` sum(const Dimensions& new_dims)` +### ` sum()` + +Reduce a tensor using the sum() operator. The resulting values +are the sum of the reduced values. + +### ` mean(const Dimensions& new_dims)` +### ` mean()` + +Reduce a tensor using the mean() operator. The resulting values +are the mean of the reduced values. + +### ` maximum(const Dimensions& new_dims)` +### ` maximum()` + +Reduce a tensor using the maximum() operator. The resulting values are the +largest of the reduced values. + +### ` minimum(const Dimensions& new_dims)` +### ` minimum()` + +Reduce a tensor using the minimum() operator. The resulting values +are the smallest of the reduced values. + +### ` prod(const Dimensions& new_dims)` +### ` prod()` + +Reduce a tensor using the prod() operator. The resulting values +are the product of the reduced values. + +### ` all(const Dimensions& new_dims)` +### ` all()` +Reduce a tensor using the all() operator. Casts tensor to bool and then checks +whether all elements are true. Runs through all elements rather than +short-circuiting, so may be significantly inefficient. + +### ` any(const Dimensions& new_dims)` +### ` any()` +Reduce a tensor using the any() operator. Casts tensor to bool and then checks +whether any element is true. Runs through all elements rather than +short-circuiting, so may be significantly inefficient. + + +### ` reduce(const Dimensions& new_dims, const Reducer& reducer)` + +Reduce a tensor using a user-defined reduction operator. See `SumReducer` +in TensorFunctors.h for information on how to implement a reduction operator. + + +## Scan Operations + +A *Scan* operation returns a tensor with the same dimensions as the original +tensor. The operation performs an inclusive scan along the specified +axis, which means it computes a running total along the axis for a given +reduction operation. +If the reduction operation corresponds to summation, then this computes the +prefix sum of the tensor along the given axis. + +Example: +dd a comment to this line + + // Create a tensor of 2 dimensions + Eigen::Tensor a(2, 3); + a.setValues({{1, 2, 3}, {4, 5, 6}}); + // Scan it along the second dimension (1) using summation + Eigen::Tensor b = a.cumsum(1); + // The result is a tensor with the same size as the input + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 2 3 + 4 5 6 + + b + 1 3 6 + 4 9 15 + +### ` cumsum(const Index& axis)` + +Perform a scan by summing consecutive entries. + +### ` cumprod(const Index& axis)` + +Perform a scan by multiplying consecutive entries. + + +## Convolutions + +### ` convolve(const Kernel& kernel, const Dimensions& dims)` + +Returns a tensor that is the output of the convolution of the input tensor with the kernel, +along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor +which were part of the convolution will be reduced by the formula: +output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size). +The dimension sizes for dimensions that were not part of the convolution will remain the same. +Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the +convolution is computed (the first dimension has the shortest stride for ColMajor, whereas RowMajor's shortest stride is +for the last dimension). + + // Compute convolution along the second and third dimension. + Tensor input(3, 3, 7, 11); + Tensor kernel(2, 2); + Tensor output(3, 2, 6, 11); + input.setRandom(); + kernel.setRandom(); + + Eigen::array dims({1, 2}); // Specify second and third dimension for convolution. + output = input.convolve(kernel, dims); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 6; ++k) { + for (int l = 0; l < 11; ++l) { + const float result = output(i,j,k,l); + const float expected = input(i,j+0,k+0,l) * kernel(0,0) + + input(i,j+1,k+0,l) * kernel(1,0) + + input(i,j+0,k+1,l) * kernel(0,1) + + input(i,j+1,k+1,l) * kernel(1,1); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + + +## Geometrical Operations + +These operations return a Tensor with different dimensions than the original +Tensor. They can be used to access slices of tensors, see them with different +dimensions, or pad tensors with additional data. + +### ` reshape(const Dimensions& new_dims)` + +Returns a view of the input tensor that has been reshaped to the specified +new dimensions. The argument new_dims is an array of Index values. The +rank of the resulting tensor is equal to the number of elements in new_dims. + +The product of all the sizes in the new dimension array must be equal to +the number of elements in the input tensor. + + // Increase the rank of the input tensor by introducing a new dimension + // of size 1. + Tensor input(7, 11); + array three_dims{{7, 11, 1}}; + Tensor result = input.reshape(three_dims); + + // Decrease the rank of the input tensor by merging 2 dimensions; + array one_dim{{7 * 11}}; + Tensor result = input.reshape(one_dim); + +This operation does not move any data in the input tensor, so the resulting +contents of a reshaped Tensor depend on the data layout of the original Tensor. + +For example this is what happens when you `reshape()` a 2D ColMajor tensor +to one dimension: + + Eigen::Tensor a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array one_dim({3 * 2}); + Eigen::Tensor b = a.reshape(one_dim); + cout << "b" << endl << b << endl; + => + b + 0 + 300 + 100 + 400 + 200 + 500 + +This is what happens when the 2D Tensor is RowMajor: + + Eigen::Tensor a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array one_dim({3 * 2}); + Eigen::Tensor b = a.reshape(one_dim); + cout << "b" << endl << b << endl; + => + b + 0 + 100 + 200 + 300 + 400 + 500 + +The reshape operation is a lvalue. In other words, it can be used on the left +side of the assignment operator. + +The previous example can be rewritten as follow: + + Eigen::Tensor a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array two_dim({2, 3}); + Eigen::Tensor b(6); + b.reshape(two_dim) = a; + cout << "b" << endl << b << endl; + => + b + 0 + 300 + 100 + 400 + 200 + 500 + +Note that "b" itself was not reshaped but that instead the assignment is done to +the reshape view of b. + + +### ` shuffle(const Shuffle& shuffle)` + +Returns a copy of the input tensor whose dimensions have been +reordered according to the specified permutation. The argument shuffle +is an array of Index values. Its size is the rank of the input +tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th +dimension of the output tensor equals to the size of the shuffle[i]-th +dimension of the input tensor. For example: + + // Shuffle all dimensions to the left by 1. + Tensor input(20, 30, 50); + // ... set some values in input. + Tensor output = input.shuffle({1, 2, 0}) + + eigen_assert(output.dimension(0) == 30); + eigen_assert(output.dimension(1) == 50); + eigen_assert(output.dimension(2) == 20); + +Indices into the output tensor are shuffled accordingly to formulate +indices into the input tensor. For example, one can assert in the above +code snippet that: + + eigen_assert(output(3, 7, 11) == input(11, 3, 7)); + +In general, one can assert that + + eigen_assert(output(..., indices[shuffle[i]], ...) == + input(..., indices[i], ...)) + +The shuffle operation results in a lvalue, which means that it can be assigned +to. In other words, it can be used on the left side of the assignment operator. + +Let's rewrite the previous example to take advantage of this feature: + + // Shuffle all dimensions to the left by 1. + Tensor input(20, 30, 50); + // ... set some values in input. + Tensor output(30, 50, 20); + output.shuffle({2, 0, 1}) = input; + + +### ` stride(const Strides& strides)` + +Returns a view of the input tensor that strides (skips stride-1 +elements) along each of the dimensions. The argument strides is an +array of Index values. The dimensions of the resulting tensor are +ceil(input_dimensions[i] / strides[i]). + +For example this is what happens when you `stride()` a 2D tensor: + + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}}); + Eigen::array strides({3, 2}); + Eigen::Tensor b = a.stride(strides); + cout << "b" << endl << b << endl; + => + b + 0 200 + 900 1100 + +It is possible to assign a tensor to a stride: + Tensor input(20, 30, 50); + // ... set some values in input. + Tensor output(40, 90, 200); + output.stride({2, 3, 4}) = input; + + +### ` slice(const StartIndices& offsets, const Sizes& extents)` + +Returns a sub-tensor of the given tensor. For each dimension i, the slice is +made of the coefficients stored between offset[i] and offset[i] + extents[i] in +the input tensor. + + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, + {600, 700, 800}, {900, 1000, 1100}}); + Eigen::array offsets = {1, 0}; + Eigen::array extents = {2, 2}; + Eigen::Tensor slice = a.slice(offsets, extents); + cout << "a" << endl << a << endl; + => + a + 0 100 200 + 300 400 500 + 600 700 800 + 900 1000 1100 + cout << "slice" << endl << slice << endl; + => + slice + 300 400 + 600 700 + + +### ` chip(const Index offset, const Index dim)` + +A chip is a special kind of slice. It is the subtensor at the given offset in +the dimension dim. The returned tensor has one fewer dimension than the input +tensor: the dimension dim is removed. + +For example, a matrix chip would be either a row or a column of the input +matrix. + + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, + {600, 700, 800}, {900, 1000, 1100}}); + Eigen::Tensor row_3 = a.chip(2, 0); + Eigen::Tensor col_2 = a.chip(1, 1); + cout << "a" << endl << a << endl; + => + a + 0 100 200 + 300 400 500 + 600 700 800 + 900 1000 1100 + cout << "row_3" << endl << row_3 << endl; + => + row_3 + 600 700 800 + cout << "col_2" << endl << col_2 << endl; + => + col_2 + 100 400 700 1000 + +It is possible to assign values to a tensor chip since the chip operation is a +lvalue. For example: + + Eigen::Tensor a(3); + a.setValues({{100, 200, 300}}); + Eigen::Tensor b(2, 3); + b.setZero(); + b.chip(0, 0) = a; + cout << "a" << endl << a << endl; + => + a + 100 + 200 + 300 + cout << "b" << endl << b << endl; + => + b + 100 200 300 + 0 0 0 + + +### ` reverse(const ReverseDimensions& reverse)` + +Returns a view of the input tensor that reverses the order of the coefficients +along a subset of the dimensions. The argument reverse is an array of boolean +values that indicates whether or not the order of the coefficients should be +reversed along each of the dimensions. This operation preserves the dimensions +of the input tensor. + +For example this is what happens when you `reverse()` the first dimension +of a 2D tensor: + + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, + {600, 700, 800}, {900, 1000, 1100}}); + Eigen::array reverse({true, false}); + Eigen::Tensor b = a.reverse(reverse); + cout << "a" << endl << a << endl << "b" << endl << b << endl; + => + a + 0 100 200 + 300 400 500 + 600 700 800 + 900 1000 1100 + b + 900 1000 1100 + 600 700 800 + 300 400 500 + 0 100 200 + + +### ` broadcast(const Broadcast& broadcast)` + +Returns a view of the input tensor in which the input is replicated one to many +times. +The broadcast argument specifies how many copies of the input tensor need to be +made in each of the dimensions. + + Eigen::Tensor a(2, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}}); + Eigen::array bcast({3, 2}); + Eigen::Tensor b = a.broadcast(bcast); + cout << "a" << endl << a << endl << "b" << endl << b << endl; + => + a + 0 100 200 + 300 400 500 + b + 0 100 200 0 100 200 + 300 400 500 300 400 500 + 0 100 200 0 100 200 + 300 400 500 300 400 500 + 0 100 200 0 100 200 + 300 400 500 300 400 500 + +### ` concatenate(const OtherDerived& other, Axis axis)` + +TODO + +### ` pad(const PaddingDimensions& padding)` + +Returns a view of the input tensor in which the input is padded with zeros. + + Eigen::Tensor a(2, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}}); + Eigen::array, 2> paddings; + paddings[0] = make_pair(0, 1); + paddings[1] = make_pair(2, 3); + Eigen::Tensor b = a.pad(paddings); + cout << "a" << endl << a << endl << "b" << endl << b << endl; + => + a + 0 100 200 + 300 400 500 + b + 0 0 0 0 + 0 0 0 0 + 0 100 200 0 + 300 400 500 0 + 0 0 0 0 + 0 0 0 0 + 0 0 0 0 + + +### ` extract_patches(const PatchDims& patch_dims)` + +Returns a tensor of coefficient patches extracted from the input tensor, where +each patch is of dimension specified by 'patch_dims'. The returned tensor has +one greater dimension than the input tensor, which is used to index each patch. +The patch index in the output tensor depends on the data layout of the input +tensor: the patch index is the last dimension ColMajor layout, and the first +dimension in RowMajor layout. + +For example, given the following input tensor: + + Eigen::Tensor tensor(3,4); + tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, + {4.0f, 5.0f, 6.0f, 7.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}); + + cout << "tensor: " << endl << tensor << endl; +=> +tensor: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 + +Six 2x2 patches can be extracted and indexed using the following code: + + Eigen::Tensor patch; + Eigen::array patch_dims; + patch_dims[0] = 2; + patch_dims[1] = 2; + patch = tensor.extract_patches(patch_dims); + for (int k = 0; k < 6; ++k) { + cout << "patch index: " << k << endl; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + if (DataLayout == ColMajor) { + cout << patch(i, j, k) << " "; + } else { + cout << patch(k, i, j) << " "; + } + } + cout << endl; + } + } + +This code results in the following output when the data layout is ColMajor: + +patch index: 0 +0 1 +4 5 +patch index: 1 +4 5 +8 9 +patch index: 2 +1 2 +5 6 +patch index: 3 +5 6 +9 10 +patch index: 4 +2 3 +6 7 +patch index: 5 +6 7 +10 11 + +This code results in the following output when the data layout is RowMajor: +(NOTE: the set of patches is the same as in ColMajor, but are indexed differently). + +patch index: 0 +0 1 +4 5 +patch index: 1 +1 2 +5 6 +patch index: 2 +2 3 +6 7 +patch index: 3 +4 5 +8 9 +patch index: 4 +5 6 +9 10 +patch index: 5 +6 7 +10 11 + +### ` extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)` + +Returns a tensor of coefficient image patches extracted from the input tensor, +which is expected to have dimensions ordered as follows (depending on the data +layout of the input tensor, and the number of additional dimensions 'N'): + +*) ColMajor +1st dimension: channels (of size d) +2nd dimension: rows (of size r) +3rd dimension: columns (of size c) +4th-Nth dimension: time (for video) or batch (for bulk processing). + +*) RowMajor (reverse order of ColMajor) +1st-Nth dimension: time (for video) or batch (for bulk processing). +N+1'th dimension: columns (of size c) +N+2'th dimension: rows (of size r) +N+3'th dimension: channels (of size d) + +The returned tensor has one greater dimension than the input tensor, which is +used to index each patch. The patch index in the output tensor depends on the +data layout of the input tensor: the patch index is the 4'th dimension in +ColMajor layout, and the 4'th from the last dimension in RowMajor layout. + +For example, given the following input tensor with the following dimension +sizes: + *) depth: 2 + *) rows: 3 + *) columns: 5 + *) batch: 7 + + Tensor tensor(2,3,5,7); + Tensor tensor_row_major = tensor.swap_layout(); + +2x2 image patches can be extracted and indexed using the following code: + +*) 2D patch: ColMajor (patch indexed by second-to-last dimension) + Tensor twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + // twod_patch.dimension(0) == 2 + // twod_patch.dimension(1) == 2 + // twod_patch.dimension(2) == 2 + // twod_patch.dimension(3) == 3*5 + // twod_patch.dimension(4) == 7 + +*) 2D patch: RowMajor (patch indexed by the second dimension) + Tensor twod_patch_row_major; + twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); + // twod_patch_row_major.dimension(0) == 7 + // twod_patch_row_major.dimension(1) == 3*5 + // twod_patch_row_major.dimension(2) == 2 + // twod_patch_row_major.dimension(3) == 2 + // twod_patch_row_major.dimension(4) == 2 + +## Special Operations + +### ` cast()` + +Returns a tensor of type T with the same dimensions as the original tensor. +The returned tensor contains the values of the original tensor converted to +type T. + + Eigen::Tensor a(2, 3); + Eigen::Tensor b = a.cast(); + +This can be useful for example if you need to do element-wise division of +Tensors of integers. This is not currently supported by the Tensor library +but you can easily cast the tensors to floats to do the division: + + Eigen::Tensor a(2, 3); + a.setValues({{0, 1, 2}, {3, 4, 5}}); + Eigen::Tensor b = + (a.cast() / a.constant(2).cast()).cast(); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 0 1 2 + 3 4 5 + + b + 0 0 1 + 1 2 2 + + +### ` eval()` + +TODO + + +## Representation of scalar values + +Scalar values are often represented by tensors of size 1 and rank 0.For example +Tensor::maximum() currently returns a Tensor. Similarly, the inner +product of 2 1d tensors (through contractions) returns a 0d tensor. + +## Limitations + +* The number of tensor dimensions is currently limited to 250 when using a + compiler that supports cxx11. It is limited to only 5 for older compilers. +* The IndexList class requires a cxx11 compliant compiler. You can use an + array of indices instead if you don't have access to a modern compiler. +* On GPUs only floating point values are properly tested and optimized for. +* Complex and integer values are known to be broken on GPUs. If you try to use + them you'll most likely end up triggering a static assertion failure such as + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/Tensor.h new file mode 100644 index 00000000..e44d51ec --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -0,0 +1,527 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_H + +namespace Eigen { + +/** \class Tensor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor class. + * + * The %Tensor class is the work-horse for all \em dense tensors within Eigen. + * + * The %Tensor class encompasses only dynamic-size objects so far. + * + * The first two template parameters are required: + * \tparam Scalar_ Numeric type, e.g. float, double, int or `std::complex`. + * User defined scalar types are supported as well (see \ref user_defined_scalars "here"). + * \tparam NumIndices_ Number of indices (i.e. rank of the tensor) + * + * The remaining template parameters are optional -- in most cases you don't have to worry about them. + * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either + * \b #AutoAlign or \b #DontAlign. + * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required + * for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization. + * Support for such operations (i.e. adding two tensors etc.) is planned. + * + * You can access elements of tensors using normal subscripting: + * + * \code + * Eigen::Tensor t(10, 10, 10, 10); + * t(0, 1, 2, 3) = 42.0; + * \endcode + * + * This class can be extended with the help of the plugin mechanism described on the page + * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN. + * + * Some notes: + * + *
+ *
Relation to other parts of Eigen:
+ *
The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that + * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code + * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor + * class does not provide any of these features and is only available as a stand-alone class that just allows for + * coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to + * change dramatically.
+ *
+ * + * \ref TopicStorageOrders + */ + +template +class Tensor : public TensorBase > +{ + public: + typedef Tensor Self; + typedef TensorBase > Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef Scalar_ Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + enum { + IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign), + Layout = Options_ & RowMajor ? RowMajor : ColMajor, + CoordAccess = true, + RawAccess = true + }; + + static const int Options = Options_; + static const int NumIndices = NumIndices_; + typedef DSizes Dimensions; + + protected: + TensorStorage m_storage; + +#ifdef EIGEN_HAS_SFINAE + template + struct isOfNormalIndex{ + static const bool is_array = internal::is_base_of, CustomIndices>::value; + static const bool is_int = NumTraits::IsInteger; + static const bool value = is_array | is_int; + }; +#endif + + public: + // Metadata + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const + { + return coeff(internal::customIndices2Array(indices)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices) + { + return coeffRef(internal::customIndices2Array(indices)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array{{firstIndex, secondIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + return coeff(array(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + return coeff(array(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + return coeff(array(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + return coeff(array(i0, i1, i2, i3, i4)); + } +#endif + + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const + { + return coeff(internal::customIndices2Array(indices)); + } +#endif + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + { + return coeff(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array{{firstIndex, secondIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + return coeffRef(array(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + return coeffRef(array(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + return coeffRef(array(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + return coeffRef(array(i0, i1, i2, i3, i4)); + } +#endif + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + { + return coeffRef(indices); + } + + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices) + { + return coeffRef(internal::customIndices2Array(indices)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor() + : m_storage() + { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const Self& other) + : m_storage(other.m_storage) + { + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions) + : m_storage(firstDimension, otherDimensions...) + { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1) + : m_storage(dim1, array(dim1)) + { + EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2) + : m_storage(dim1*dim2, array(dim1, dim2)) + { + EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3) + : m_storage(dim1*dim2*dim3, array(dim1, dim2, dim3)) + { + EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4) + : m_storage(dim1*dim2*dim3*dim4, array(dim1, dim2, dim3, dim4)) + { + EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) + : m_storage(dim1*dim2*dim3*dim4*dim5, array(dim1, dim2, dim3, dim4, dim5)) + { + EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#endif + + /** Normal Dimension */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array& dimensions) + : m_storage(internal::array_prod(dimensions), dimensions) + { + EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + void resize(Index firstDimension, IndexTypes... otherDimensions) + { + // The number of dimensions used to resize a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + resize(array{{firstDimension, otherDimensions...}}); + } +#endif + + /** Normal Dimension */ + EIGEN_DEVICE_FUNC void resize(const array& dimensions) + { + int i; + Index size = Index(1); + for (i = 0; i < NumIndices; i++) { + internal::check_rows_cols_for_overflow::run(size, dimensions[i]); + size *= dimensions[i]; + } + #ifdef EIGEN_INITIALIZE_COEFFS + bool size_changed = size != this->size(); + m_storage.resize(size, dimensions); + if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + #else + m_storage.resize(size, dimensions); + #endif + } + + // Why this overload, DSizes is derived from array ??? // + EIGEN_DEVICE_FUNC void resize(const DSizes& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = dimensions[i]; + } + resize(dims); + } + + EIGEN_DEVICE_FUNC + void resize() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + // Nothing to do: rank 0 tensors have fixed size + } + + /** Custom Dimension */ +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions) + { + resize(internal::customIndices2Array(dimensions)); + } +#endif + +#ifndef EIGEN_EMULATE_CXX11_META_H + template + EIGEN_DEVICE_FUNC + void resize(const Sizes& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = static_cast(dimensions[i]); + } + resize(dims); + } +#else + template + EIGEN_DEVICE_FUNC + void resize(const Sizes& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = static_cast(dimensions[i]); + } + resize(dims); + } +#endif + + protected: + + bool checkIndexRange(const array& indices) const + { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return + // check whether the indices are all >= 0 + array_apply_and_reduce(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce(indices, m_storage.dimensions()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h new file mode 100644 index 00000000..7da160fd --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -0,0 +1,299 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H +#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H + +namespace Eigen { +namespace internal { + +/** \class TensorIndexTuple + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor + Index Tuple class. + * + * + */ +template +struct traits > : public traits +{ + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef Tuple Scalar; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorIndexTupleOp& type; +}; + +template +struct nested, 1, + typename eval >::type> +{ + typedef TensorIndexTupleOp type; +}; + +} // end namespace internal + +template +class TensorIndexTupleOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + typedef Tuple CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexTupleOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorIndexTupleOp XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + typedef typename TensorEvaluator::Dimensions Dimensions; + static const int NumDims = internal::array_size::value; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = /*TensorEvaluator::PacketAccess*/ false, + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return CoeffReturnType(index, m_impl.coeff(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + TensorEvaluator m_impl; +}; + +namespace internal { + +/** \class TensorTupleIndex + * \ingroup CXX11_Tensor_Module + * + * \brief Converts to Tensor > and reduces to Tensor. + * + */ +template +struct traits > : public traits +{ + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef Index Scalar; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - array_size::value; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorTupleReducerOp& type; +}; + +template +struct nested, 1, + typename eval >::type> +{ + typedef TensorTupleReducerOp type; +}; + +} // end namespace internal + +template +class TensorTupleReducerOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + typedef Index CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr, + const ReduceOp& reduce_op, + const int return_dim, + const Dims& reduce_dims) + : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + const ReduceOp& reduce_op() const { return m_reduce_op; } + + EIGEN_DEVICE_FUNC + const Dims& reduce_dims() const { return m_reduce_dims; } + + EIGEN_DEVICE_FUNC + int return_dim() const { return m_return_dim; } + + protected: + typename XprType::Nested m_xpr; + const ReduceOp m_reduce_op; + const int m_return_dim; + const Dims m_reduce_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorTupleReducerOp XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename TensorIndexTupleOp::CoeffReturnType TupleType; + typedef typename TensorEvaluator >, Device>::Dimensions Dimensions; + typedef typename TensorEvaluator , Device>::Dimensions InputDimensions; + static const int NumDims = internal::array_size::value; + typedef array StrideDims; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = /*TensorEvaluator::PacketAccess*/ false, + BlockAccess = false, + Layout = TensorEvaluator >, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_orig_impl(op.expression(), device), + m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), + m_return_dim(op.return_dim()) { + + gen_strides(m_orig_impl.dimensions(), m_strides); + if (Layout == static_cast(ColMajor)) { + const Index total_size = internal::array_prod(m_orig_impl.dimensions()); + m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size; + } else { + const Index total_size = internal::array_prod(m_orig_impl.dimensions()); + m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size; + } + m_stride_div = m_strides[m_return_dim]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + const TupleType v = m_impl.coeff(index); + return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = 1.0 + + (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost() + TensorOpCost::DivCost())); + return m_orig_impl.costPerCoeff(vectorized) + + m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost); + } + + private: + EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) { + if (m_return_dim < 0) { + return; // Won't be using the strides. + } + eigen_assert(m_return_dim < NumDims && + "Asking to convert index to a dimension outside of the rank"); + + // Calculate m_stride_div and m_stride_mod, which are used to + // calculate the value of an index w.r.t. the m_return_dim. + if (Layout == static_cast(ColMajor)) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i-1] * dims[i-1]; + } + } else { + strides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i+1] * dims[i+1]; + } + } + } + + protected: + TensorEvaluator, Device> m_orig_impl; + TensorEvaluator >, Device> m_impl; + const int m_return_dim; + StrideDims m_strides; + Index m_stride_mod; + Index m_stride_div; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h new file mode 100644 index 00000000..e13ba738 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -0,0 +1,181 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H +#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H + +namespace Eigen { + +/** \class TensorAssign + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor assignment class. + * + * This class is represents the assignment of the values resulting from the evaluation of + * the rhs expression to the memory locations denoted by the lhs expression. + */ +namespace internal { +template +struct traits > +{ + typedef typename LhsXprType::Scalar Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const std::size_t NumDimensions = internal::traits::NumDimensions; + static const int Layout = internal::traits::Layout; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorAssignOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorAssignOp type; +}; + +} // end namespace internal + + + +template +class TensorAssignOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename LhsXprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + typename internal::remove_all::type& + lhsExpression() const { return *((typename internal::remove_all::type*)&m_lhs_xpr); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename internal::remove_all::type& m_lhs_xpr; + const typename internal::remove_all::type& m_rhs_xpr; +}; + + +template +struct TensorEvaluator, Device> +{ + typedef TensorAssignOp XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // The dimensions of the lhs and the rhs tensors should be equal to prevent + // overflows and ensure the result is fully initialized. + // TODO: use left impl instead if right impl dimensions are known at compile time. + return m_rightImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + m_leftImpl.evalSubExprsIfNeeded(NULL); + // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non + // null value), attempt to evaluate the rhs expression in place. Returns true iff in place + // evaluation isn't supported and the caller still needs to manually assign the values generated + // by the rhs to the lhs. + return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_leftImpl.coeff(index); + } + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_leftImpl.template packet(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here, but reduce left + // cost by one load because we are using m_leftImpl.coeffRef. + TensorOpCost left = m_leftImpl.costPerCoeff(vectorized); + return m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost( + numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)), + left.bytes_stored(), left.compute_cycles()) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + + /// required by sycl in order to extract the accessor + const TensorEvaluator& left_impl() const { return m_leftImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& right_impl() const { return m_rightImpl; } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } + + private: + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + +} + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h new file mode 100644 index 00000000..49c07959 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -0,0 +1,1012 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H +#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H + +// clang-format off + +namespace Eigen { + +/** \class TensorBase + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor base class. + * + * This class is the common parent of the Tensor and TensorMap class, thus + * making it possible to use either class interchangably in expressions. + */ +#ifndef EIGEN_PARSED_BY_DOXYGEN +// FIXME Doxygen does not like the inheritance with different template parameters +// Since there is no doxygen documentation inside, we disable it for now +template +class TensorBase +{ + public: + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef typename internal::remove_const::type CoeffReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; + + // Generic nullary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + nullaryExpr(const CustomNullaryOp& func) const { + return TensorCwiseNullaryOp(derived(), func); + } + + // Coefficient-wise nullary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + constant(const Scalar& value) const { + return nullaryExpr(internal::scalar_constant_op(value)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + random() const { + return nullaryExpr(internal::UniformRandomGenerator()); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + random(const RandomGenerator& gen = RandomGenerator()) const { + return nullaryExpr(gen); + } + + // Tensor generation + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorGeneratorOp + generate(const Generator& generator) const { + return TensorGeneratorOp(derived(), generator); + } + + // Generic unary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp + unaryExpr(const CustomUnaryOp& func) const { + return TensorCwiseUnaryOp(derived(), func); + } + + // Coefficient-wise unary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator-() const { + return unaryExpr(internal::scalar_opposite_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sqrt() const { + return unaryExpr(internal::scalar_sqrt_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sign() const { + return unaryExpr(internal::scalar_sign_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + rsqrt() const { + return unaryExpr(internal::scalar_rsqrt_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + square() const { + return unaryExpr(internal::scalar_square_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cube() const { + return unaryExpr(internal::scalar_cube_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + inverse() const { + return unaryExpr(internal::scalar_inverse_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + tanh() const { + return unaryExpr(internal::scalar_tanh_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + lgamma() const { + return unaryExpr(internal::scalar_lgamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + digamma() const { + return unaryExpr(internal::scalar_digamma_op()); + } + + // igamma(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igamma_op()); + } + + // igammac(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igammac(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igammac_op()); + } + + // zeta(x = this, q = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + zeta(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_zeta_op()); + } + + // polygamma(n = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + polygamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_polygamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erf() const { + return unaryExpr(internal::scalar_erf_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erfc() const { + return unaryExpr(internal::scalar_erfc_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sigmoid() const { + return unaryExpr(internal::scalar_sigmoid_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + exp() const { + return unaryExpr(internal::scalar_exp_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log() const { + return unaryExpr(internal::scalar_log_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log1p() const { + return unaryExpr(internal::scalar_log1p_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + abs() const { + return unaryExpr(internal::scalar_abs_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + conjugate() const { + return unaryExpr(internal::scalar_conjugate_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + pow(Scalar exponent) const { + return unaryExpr(internal::bind2nd_op >(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + real() const { + return unaryExpr(internal::scalar_real_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + imag() const { + return unaryExpr(internal::scalar_imag_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator+ (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator+ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator- (Scalar rhs) const { + EIGEN_STATIC_ASSERT((NumTraits::IsSigned || internal::is_same >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator- (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator* (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator* (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator/ (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator/ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator% (Scalar rhs) const { + EIGEN_STATIC_ASSERT(NumTraits::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD); + return unaryExpr(internal::scalar_mod_op(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMax(Scalar threshold) const { + return cwiseMax(constant(threshold)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMin(Scalar threshold) const { + return cwiseMin(constant(threshold)); + } + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorConversionOp + cast() const { + return TensorConversionOp(derived()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + round() const { + return unaryExpr(internal::scalar_round_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + ceil() const { + return unaryExpr(internal::scalar_ceil_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + floor() const { + return unaryExpr(internal::scalar_floor_op()); + } + + // Generic binary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp + binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { + return TensorCwiseBinaryOp(derived(), other, func); + } + + // Coefficient-wise binary operators. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator+(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_sum_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator-(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_difference_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator*(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_product_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator/(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_quotient_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMax(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_max_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMin(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_min_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp + operator&&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp + operator||(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp + operator^(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_xor_op()); + } + + // Comparisons and tests. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<=(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>=(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator==(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator!=(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + // comparisons and tests for Scalars + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator<(Scalar threshold) const { + return operator<(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator<=(Scalar threshold) const { + return operator<=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator>(Scalar threshold) const { + return operator>(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator>=(Scalar threshold) const { + return operator>=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator==(Scalar threshold) const { + return operator==(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator!=(Scalar threshold) const { + return operator!=(constant(threshold)); + } + + // Checks + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + (isnan)() const { + return unaryExpr(internal::scalar_isnan_op()); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + (isinf)() const { + return unaryExpr(internal::scalar_isinf_op()); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + (isfinite)() const { + return unaryExpr(internal::scalar_isfinite_op()); + } + + // Coefficient-wise ternary operators. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { + return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); + } + + // Contractions. + typedef Eigen::IndexPair DimensionPair; + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims) const { + return TensorContractionOp(derived(), other.derived(), dims); + } + + // Convolutions. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConvolutionOp + convolve(const KernelDerived& kernel, const Dimensions& dims) const { + return TensorConvolutionOp(derived(), kernel.derived(), dims); + } + + // Fourier transforms + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorFFTOp + fft(const FFT& fft) const { + return TensorFFTOp(derived(), fft); + } + + // Scan. + typedef TensorScanOp, const Derived> TensorScanSumOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanSumOp + cumsum(const Index& axis, bool exclusive = false) const { + return TensorScanSumOp(derived(), axis, exclusive); + } + + typedef TensorScanOp, const Derived> TensorScanProdOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanProdOp + cumprod(const Index& axis, bool exclusive = false) const { + return TensorScanProdOp(derived(), axis, exclusive); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanOp + scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const { + return TensorScanOp(derived(), axis, exclusive, reducer); + } + + // Reductions. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + sum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + sum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::SumReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + mean(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MeanReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + mean() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MeanReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + prod(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::ProdReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + prod() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::ProdReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + maximum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + maximum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MaxReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + minimum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + minimum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MinReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp > + all(const Dims& dims) const { + return cast().reduce(dims, internal::AndReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const TensorConversionOp > + all() const { + DimensionList in_dims; + return cast().reduce(in_dims, internal::AndReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp > + any(const Dims& dims) const { + return cast().reduce(dims, internal::OrReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const TensorConversionOp > + any() const { + DimensionList in_dims; + return cast().reduce(in_dims, internal::OrReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMaxTupleReducer >, + const array, const Derived> + argmax() const { + array in_dims; + for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorTupleReducerOp< + internal::ArgMaxTupleReducer >, + const array, + const Derived>(derived(), internal::ArgMaxTupleReducer >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMinTupleReducer >, + const array, const Derived> + argmin() const { + array in_dims; + for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorTupleReducerOp< + internal::ArgMinTupleReducer >, + const array, + const Derived>(derived(), internal::ArgMinTupleReducer >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMaxTupleReducer >, + const array, const Derived> + argmax(const int return_dim) const { + array in_dims; + in_dims[0] = return_dim; + return TensorTupleReducerOp< + internal::ArgMaxTupleReducer >, + const array, + const Derived>(derived(), internal::ArgMaxTupleReducer >(), return_dim, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMinTupleReducer >, + const array, const Derived> + argmin(const int return_dim) const { + array in_dims; + in_dims[0] = return_dim; + return TensorTupleReducerOp< + internal::ArgMinTupleReducer >, + const array, + const Derived>(derived(), internal::ArgMinTupleReducer >(), return_dim, in_dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp + reduce(const Dims& dims, const Reducer& reducer) const { + return TensorReductionOp(derived(), dims, reducer); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorBroadcastingOp + broadcast(const Broadcast& broadcast) const { + return TensorBroadcastingOp(derived(), broadcast); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, Axis axis) const { + return TensorConcatenationOp(derived(), other.derived(), axis); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPatchOp + extract_patches(const PatchDims& patch_dims) const { + return TensorPatchOp(derived(), patch_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index patch_rows = 1, const Index patch_cols = 1, + const Index row_stride = 1, const Index col_stride = 1, + const Index in_row_stride = 1, const Index in_col_stride = 1, + const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, 1, 1, padding_type, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top, const Index padding_bottom, + const Index padding_left,const Index padding_right, + const Scalar padding_value) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride, + padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1, + const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { + return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value); + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride, const Index row_stride, const Index col_stride, + const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top_z, const Index padding_bottom_z, + const Index padding_top, const Index padding_bottom, + const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const { + return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + // Morphing operators. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset) const { + return TensorChippingOp(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp + pad(const PaddingDimensions& padding) const { + return TensorPaddingOp(derived(), padding, internal::scalar_cast_op()(0)); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp + pad(const PaddingDimensions& padding, const Scalar padding_value) const { + return TensorPaddingOp(derived(), padding, padding_value); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp(derived(), shuffle); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorInflationOp + inflate(const Strides& strides) const { + return TensorInflationOp(derived(), strides); + } + + // Returns a tensor containing index/value tuples + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorIndexTupleOp + index_tuples() const { + return TensorIndexTupleOp(derived()); + } + + // Support for custom unary and binary operations + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomUnaryOp customOp(const CustomUnaryFunc& op) const { + return TensorCustomUnaryOp(derived(), op); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomBinaryOp customOp(const OtherDerived& other, const CustomBinaryFunc& op) const { + return TensorCustomBinaryOp(derived(), other, op); + } + + // Force the evaluation of the expression. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorForcedEvalOp eval() const { + return TensorForcedEvalOp(derived()); + } + + protected: + template friend class Tensor; + template friend class TensorFixedSize; + template friend class TensorBase; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } +}; + +template::value> +class TensorBase : public TensorBase { + public: + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef Scalar CoeffReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; + + template friend class Tensor; + template friend class TensorFixedSize; + template friend class TensorBase; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { + return setConstant(Scalar(0)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { + return derived() = this->constant(val); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->random(); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->template random(); + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setValues( + const typename internal::Initializer::InitList& vals) { + TensorEvaluator eval(derived(), DefaultDevice()); + internal::initialize_tensor(eval, vals); + return derived(); + } +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const OtherDerived& other) { + return derived() = derived() + other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const OtherDerived& other) { + return derived() = derived() - other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const OtherDerived& other) { + return derived() = derived() * other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const OtherDerived& other) { + return derived() = derived() / other.derived(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp + swap_layout() { + return TensorLayoutSwapOp(derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, const Axis& axis) const { + return TensorConcatenationOp(derived(), other, axis); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorConcatenationOp + concatenate(const OtherDerived& other, const Axis& axis) { + return TensorConcatenationOp(derived(), other, axis); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReshapingOp + reshape(const NewDimensions& newDimensions) { + return TensorReshapingOp(derived(), newDimensions); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) { + return TensorSlicingOp(derived(), startIndices, sizes); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset) const { + return TensorChippingOp(derived(), offset, DimId); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset) { + return TensorChippingOp(derived(), offset, DimId); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset, const Index dim) { + return TensorChippingOp(derived(), offset, dim); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReverseOp + reverse(const ReverseDimensions& rev) { + return TensorReverseOp(derived(), rev); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp(derived(), shuffle); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp + shuffle(const Shuffle& shuffle) { + return TensorShufflingOp(derived(), shuffle); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp + stride(const Strides& strides) { + return TensorStridingOp(derived(), strides); + } + + // Select the device on which to evaluate the expression. + template + TensorDevice device(const DeviceType& device) { + return TensorDevice(device, derived()); + } + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } +}; +#endif // EIGEN_PARSED_BY_DOXYGEN +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h new file mode 100644 index 00000000..260dcfe6 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -0,0 +1,392 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H +#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H + +namespace Eigen { + +/** \class TensorBroadcasting + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor broadcasting class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorBroadcastingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorBroadcastingOp type; +}; + +template +struct is_input_scalar { + static const bool value = false; +}; +template <> +struct is_input_scalar > { + static const bool value = true; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template +struct is_input_scalar > { + static const bool value = (Sizes::total_size == 1); +}; +#endif + +} // end namespace internal + + + +template +class TensorBroadcastingOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) + : m_xpr(expr), m_broadcast(broadcast) {} + + EIGEN_DEVICE_FUNC + const Broadcast& broadcast() const { return m_broadcast; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Broadcast m_broadcast; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorBroadcastingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_broadcast(op.broadcast()),m_impl(op.expression(), device) + { + // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar + // and store the result in a scalar. Instead one should reshape the scalar into a a N-D + // tensor with N >= 1 of 1 element first and then broadcast. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + const InputDimensions& input_dims = m_impl.dimensions(); + const Broadcast& broadcast = op.broadcast(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i] * broadcast[i]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + m_inputStrides[NumDims-1] = 1; + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims-2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const + { + if (internal::is_input_scalar::type>::value) { + return m_impl.coeff(0); + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + return coeffColMajor(index); + } else { + return coeffRowMajor(index); + } + } + + // TODO: attempt to speed this up. The integer divisions and modulo are slow + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + inputIndex += index; + } else { + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[0]); + } + } + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + { + Index inputIndex = 0; + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + inputIndex += index; + } else { + if (internal::index_statically_eq(NumDims-1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[NumDims-1]); + } + } + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const + { + if (internal::is_input_scalar::type>::value) { + return internal::pset1(m_impl.coeff(0)); + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + return packetColMajor(index); + } else { + return packetRowMajor(index); + } + } + + // Ignore the LoadMode and always use unaligned loads since we can't guarantee + // the alignment at compile time. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + innermostLoc = index; + } else { + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[0]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. + if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + values[0] = m_impl.coeff(inputIndex); + for (int i = 1; i < PacketSize; ++i) { + values[i] = coeffColMajor(originalIndex+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + innermostLoc = index; + } else { + if (internal::index_statically_eq(NumDims-1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[NumDims-1]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. + if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + values[0] = m_impl.coeff(inputIndex); + for (int i = 1; i < PacketSize; ++i) { + values[i] = coeffRowMajor(originalIndex+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + double compute_cost = TensorOpCost::AddCost(); + if (NumDims > 0) { + for (int i = NumDims - 1; i > 0; --i) { + compute_cost += TensorOpCost::DivCost(); + if (internal::index_statically_eq(i, 1)) { + compute_cost += + TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else { + if (!internal::index_statically_eq(i, 1)) { + compute_cost += TensorOpCost::MulCost() + + TensorOpCost::ModCost() + + TensorOpCost::AddCost(); + } + } + compute_cost += + TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } + } + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + const TensorEvaluator& impl() const { return m_impl; } + + Broadcast functor() const { return m_broadcast; } + + protected: + const Broadcast m_broadcast; + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h new file mode 100644 index 00000000..e81b0718 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -0,0 +1,384 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H + +namespace Eigen { + +/** \class TensorKChippingReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. + * + * + */ + +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - 1; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorChippingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorChippingOp type; +}; + +template +struct DimensionId +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { + eigen_assert(dim == DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return DimId; + } +}; +template <> +struct DimensionId +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) { + eigen_assert(dim >= 0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return actual_dim; + } + private: + const DenseIndex actual_dim; +}; + + +} // end namespace internal + + + +template +class TensorChippingOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) + : m_xpr(expr), m_offset(offset), m_dim(dim) { + } + + EIGEN_DEVICE_FUNC + const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC + const Index dim() const { return m_dim.actualDim(); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Index m_offset; + const internal::DimensionId m_dim; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorChippingOp XprType; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets. + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) + { + EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(NumInputDims > m_dim.actualDim()); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + eigen_assert(op.offset() < input_dims[m_dim.actualDim()]); + + int j = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (i != m_dim.actualDim()) { + m_dimensions[j] = input_dims[i]; + ++j; + } + } + + m_stride = 1; + m_inputStride = 1; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < m_dim.actualDim(); ++i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } else { + for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } + m_inputStride *= input_dims[m_dim.actualDim()]; + m_inputOffset = m_stride * op.offset(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + Index inputIndex = index * m_inputStride + m_inputOffset; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = m_impl.coeff(inputIndex); + inputIndex += m_inputStride; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + return m_impl.template packet(index + m_inputOffset); + } else { + const Index idx = index / m_stride; + const Index rem = index - idx * m_stride; + if (rem + PacketSize <= m_stride) { + Index inputIndex = idx * m_inputStride + m_inputOffset + rem; + return m_impl.template packet(inputIndex); + } else { + // Cross the stride boundary. Fallback to slow path. + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index); + ++index; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + double cost = 0; + if ((static_cast(Layout) == static_cast(ColMajor) && + m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && + m_dim.actualDim() == NumInputDims - 1)) { + cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else if ((static_cast(Layout) == static_cast(ColMajor) && + m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && + m_dim.actualDim() == 0)) { + cost += TensorOpCost::AddCost(); + } else { + cost += 3 * TensorOpCost::MulCost() + TensorOpCost::DivCost() + + 3 * TensorOpCost::AddCost(); + } + + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { + CoeffReturnType* result = const_cast(m_impl.data()); + if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) && + result) { + return result + m_inputOffset; + } else { + return NULL; + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex; + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + inputIndex = index * m_inputStride + m_inputOffset; + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + inputIndex = index + m_inputOffset; + } else { + const Index idx = index / m_stride; + inputIndex = idx * m_inputStride + m_inputOffset; + index -= idx * m_stride; + inputIndex += index; + } + return inputIndex; + } + + Dimensions m_dimensions; + Index m_stride; + Index m_inputOffset; + Index m_inputStride; + TensorEvaluator m_impl; + const internal::DimensionId m_dim; + const Device& m_device; +}; + + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorChippingOp XprType; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + + if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == 0) || + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(this->m_stride == 1); + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + internal::pstore(values, x); + Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + for (int i = 0; i < PacketSize; ++i) { + this->m_impl.coeffRef(inputIndex) = values[i]; + inputIndex += this->m_inputStride; + } + } else if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. + eigen_assert(this->m_stride > index); + this->m_impl.template writePacket(index + this->m_inputOffset, x); + } else { + const Index idx = index / this->m_stride; + const Index rem = index - idx * this->m_stride; + if (rem + PacketSize <= this->m_stride) { + const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; + this->m_impl.template writePacket(inputIndex, x); + } else { + // Cross stride boundary. Fallback to slow path. + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + internal::pstore(values, x); + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index) = values[i]; + ++index; + } + } + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h new file mode 100644 index 00000000..ecbacc28 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -0,0 +1,361 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H + +namespace Eigen { + +/** \class TensorConcatenationOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor concatenation class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type::ret Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConcatenationOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConcatenationOp type; +}; + +} // end namespace internal + + +template +class TensorConcatenationOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::nested::type Nested; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const TensorConcatenationOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Axis m_axis; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorConcatenationOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + static const int RightNumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(0 <= m_axis && m_axis < NumDims); + const Dimensions& lhs_dims = m_leftImpl.dimensions(); + const Dimensions& rhs_dims = m_rightImpl.dimensions(); + { + int i = 0; + for (; i < m_axis; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. + eigen_assert(rhs_dims[i] > 0); + m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; + for (++i; i < NumDims; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_leftStrides[0] = 1; + m_rightStrides[0] = 1; + m_outputStrides[0] = 1; + + for (int j = 1; j < NumDims; ++j) { + m_leftStrides[j] = m_leftStrides[j-1] * lhs_dims[j-1]; + m_rightStrides[j] = m_rightStrides[j-1] * rhs_dims[j-1]; + m_outputStrides[j] = m_outputStrides[j-1] * m_dimensions[j-1]; + } + } else { + m_leftStrides[NumDims - 1] = 1; + m_rightStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + + for (int j = NumDims - 2; j >= 0; --j) { + m_leftStrides[j] = m_leftStrides[j+1] * lhs_dims[j+1]; + m_rightStrides[j] = m_rightStrides[j+1] * rhs_dims[j+1]; + m_outputStrides[j] = m_outputStrides[j+1] * m_dimensions[j+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) + { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. + // See CL/76180724 comments for more ideas. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Collect dimension-wise indices (subs). + array subs; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[NumDims - 1] = index; + } + + const Dimensions& left_dims = m_leftImpl.dimensions(); + if (subs[m_axis] < left_dims[m_axis]) { + Index left_index; + if (static_cast(Layout) == static_cast(ColMajor)) { + left_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } else { + left_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } + return m_leftImpl.coeff(left_index); + } else { + subs[m_axis] -= left_dims[m_axis]; + const Dimensions& right_dims = m_rightImpl.dimensions(); + Index right_index; + if (static_cast(Layout) == static_cast(ColMajor)) { + right_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } else { + right_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } + return m_rightImpl.coeff(right_index); + } + } + + // TODO(phli): Add a real vectorization. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost() + + TensorOpCost::ModCost()); + const double lhs_size = m_leftImpl.dimensions().TotalSize(); + const double rhs_size = m_rightImpl.dimensions().TotalSize(); + return (lhs_size / (lhs_size + rhs_size)) * + m_leftImpl.costPerCoeff(vectorized) + + (rhs_size / (lhs_size + rhs_size)) * + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_leftStrides; + array m_rightStrides; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; + const Axis m_axis; +}; + +// Eval as lvalue +template + struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorConcatenationOp XprType; + typedef typename Base::Dimensions Dimensions; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) + : Base(op, device) + { + EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + // Collect dimension-wise indices (subs). + array subs; + for (int i = Base::NumDims - 1; i > 0; --i) { + subs[i] = index / this->m_outputStrides[i]; + index -= subs[i] * this->m_outputStrides[i]; + } + subs[0] = index; + + const Dimensions& left_dims = this->m_leftImpl.dimensions(); + if (subs[this->m_axis] < left_dims[this->m_axis]) { + Index left_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i]; + } + return this->m_leftImpl.coeffRef(left_index); + } else { + subs[this->m_axis] -= left_dims[this->m_axis]; + const Dimensions& right_dims = this->m_rightImpl.dimensions(); + Index right_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i]; + } + return this->m_rightImpl.coeffRef(right_index); + } + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); + + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + internal::pstore(values, x); + for (int i = 0; i < packetSize; ++i) { + coeffRef(index+i) = values[i]; + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h new file mode 100644 index 00000000..06c1e8f6 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -0,0 +1,628 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H + +namespace Eigen { + +/** \class TensorContraction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor contraction class. + * + * + */ +namespace internal { + +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename gebp_traits::type, + typename remove_const::type>::ResScalar Scalar; + + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + + // From NumDims below. + static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; + static const int Layout = traits::Layout; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorContractionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorContractionOp type; +}; + +template +struct traits, Device_> > { + typedef Indices_ Indices; + typedef LeftArgType_ LeftArgType; + typedef RightArgType_ RightArgType; + typedef Device_ Device; + + // From NumDims below. + static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; +}; + +} // end namespace internal + +template +class TensorContractionOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename internal::gebp_traits::ResScalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} + + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Indices m_indices; +}; + + +template +struct TensorContractionEvaluatorBase +{ + typedef typename internal::traits::Indices Indices; + typedef typename internal::traits::LeftArgType LeftArgType; + typedef typename internal::traits::RightArgType RightArgType; + typedef typename internal::traits::Device Device; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + IsAligned = true, + PacketAccess = (internal::unpacket_traits::size > 1), + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = true + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + typedef DSizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), + op.lhsExpression(), op.rhsExpression()), device), + m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), + op.rhsExpression(), op.lhsExpression()), device), + m_device(device), + m_result(NULL) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + + DSizes eval_left_dims; + DSizes eval_right_dims; + array, ContractDims> eval_op_indices; + if (static_cast(Layout) == static_cast(ColMajor)) { + // For ColMajor, we keep using the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[i]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[i]; + } + // We keep the pairs of contracting indices. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = op.indices()[i].first; + eval_op_indices[i].second = op.indices()[i].second; + } + } else { + // For RowMajor, we need to reverse the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; + } + // We need to flip all the pairs of contracting indices as well as + // reversing the dimensions. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; + } + } + + // Check for duplicate axes and make sure the first index in eval_op_indices + // is increasing. Using O(n^2) sorting is OK since ContractDims is small + for (int i = 0; i < ContractDims; i++) { + for (int j = i + 1; j < ContractDims; j++) { + eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first && + eval_op_indices[j].second != eval_op_indices[i].second && + "contraction axes should be unique"); + if (eval_op_indices[j].first < eval_op_indices[i].first) { + numext::swap(eval_op_indices[j], eval_op_indices[i]); + } + } + } + + array lhs_strides; + lhs_strides[0] = 1; + for (int i = 0; i < LDims-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; + } + + array rhs_strides; + rhs_strides[0] = 1; + for (int i = 0; i < RDims-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; + } + + if (m_i_strides.size() > 0) m_i_strides[0] = 1; + if (m_j_strides.size() > 0) m_j_strides[0] = 1; + if (m_k_strides.size() > 0) m_k_strides[0] = 1; + + m_i_size = 1; + m_j_size = 1; + m_k_size = 1; + + // To compute the dimension, we simply concatenate the non-contracting + // dimensions of the left and then the right tensor. Additionally, we also + // compute the strides corresponding to the left non-contracting + // dimensions and right non-contracting dimensions. + m_lhs_inner_dim_contiguous = true; + int dim_idx = 0; + unsigned int nocontract_idx = 0; + + for (int i = 0; i < LDims; i++) { + // find if we are contracting on index i of left tensor + bool contracting = false; + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].first == i) { + contracting = true; + break; + } + } + if (!contracting) { + // add dimension size to output dimensions + m_dimensions[dim_idx] = eval_left_dims[i]; + m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; + if (dim_idx != i) { + m_lhs_inner_dim_contiguous = false; + } + if (nocontract_idx+1 < internal::array_size::value) { + m_i_strides[nocontract_idx+1] = + m_i_strides[nocontract_idx] * eval_left_dims[i]; + } else { + m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; + } + dim_idx++; + nocontract_idx++; + } + } + + nocontract_idx = 0; + for (int i = 0; i < RDims; i++) { + bool contracting = false; + // find if we are contracting on index i of right tensor + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].second == i) { + contracting = true; + break; + } + } + if (!contracting) { + m_dimensions[dim_idx] = eval_right_dims[i]; + if (nocontract_idx+1 < internal::array_size::value) { + m_j_strides[nocontract_idx+1] = + m_j_strides[nocontract_idx] * eval_right_dims[i]; + } else { + m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; + } + m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; + dim_idx++; + nocontract_idx++; + } + } + + // Now compute the strides corresponding to the contracting dimensions. We + // assumed above that non-contracting axes are represented in the same order + // in the matrix as they are in the tensor. This is not the case for + // contracting axes. As the contracting axes must be of the same size in + // each tensor, we'll only look at the first tensor here. + m_rhs_inner_dim_contiguous = true; + m_rhs_inner_dim_reordered = false; + for (int i = 0; i < ContractDims; i++) { + Index left = eval_op_indices[i].first; + Index right = eval_op_indices[i].second; + + Index size = eval_left_dims[left]; + eigen_assert(size == eval_right_dims[right] && + "Contraction axes must be same size"); + + if (i+1 < static_cast(internal::array_size::value)) { + m_k_strides[i+1] = m_k_strides[i] * size; + } else { + m_k_size = m_k_strides[i] * size; + } + m_left_contracting_strides[i] = lhs_strides[left]; + m_right_contracting_strides[i] = rhs_strides[right]; + + if (i > 0 && right < eval_op_indices[i-1].second) { + m_rhs_inner_dim_reordered = true; + } + if (right != i) { + m_rhs_inner_dim_contiguous = false; + } + } + + // If the layout is RowMajor, we need to reverse the m_dimensions + if (static_cast(Layout) == static_cast(RowMajor)) { + for (int i = 0, j = NumDims - 1; i < j; i++, j--) { + numext::swap(m_dimensions[i], m_dimensions[j]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalProduct(buffer); + } + else { + static_cast(this)->template evalProduct(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalProduct(buffer); + } + else { + static_cast(this)->template evalProduct(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalProduct(buffer); + } + else { + static_cast(this)->template evalProduct(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalProduct(buffer); + } + else { + static_cast(this)->template evalProduct(buffer); + } + } + } + } + + template + EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { + const Index rows = m_i_size; + const Index cols = m_k_size; + + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + const Index lhs_packet_size = internal::unpacket_traits::size; + const Index rhs_packet_size = internal::unpacket_traits::size; + const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned; + const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned; + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, + m_left_contracting_strides, m_k_strides); + RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, + m_right_contracting_strides, m_k_strides); + + const Scalar alpha(1); + const Index resIncr(1); + + // zero out the result buffer (which must be of size at least rows * sizeof(Scalar) + m_device.memset(buffer, 0, rows * sizeof(Scalar)); + + internal::general_matrix_vector_product::run( + rows, cols, lhs, rhs, + buffer, resIncr, alpha); + } + + template + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::gebp_traits Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + const Index lhs_packet_size = internal::unpacket_traits::size; + const Index rhs_packet_size = internal::unpacket_traits::size; + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + // Declare GEBP packing and kernel structs + internal::gemm_pack_lhs pack_lhs; + internal::gemm_pack_rhs pack_rhs; + + internal::gebp_kernel gebp; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // Sizes of the blocks to load in cache. See the Goto paper for details. + internal::TensorContractionBlocking blocking(k, m, n, 1); + const Index kc = blocking.kc(); + const Index mc = numext::mini(m, blocking.mc()); + const Index nc = numext::mini(n, blocking.nc()); + const Index sizeA = mc * kc; + const Index sizeB = kc * nc; + + LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + + for(Index i2=0; i2m_device.deallocate(blockA); + this->m_device.deallocate(blockB); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } + + protected: + // Prevent assignment + TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); + Dimensions m_dimensions; + + contract_t m_k_strides; + contract_t m_left_contracting_strides; + contract_t m_right_contracting_strides; + + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + left_nocontract_t m_i_strides; + right_nocontract_t m_j_strides; + left_nocontract_t m_left_nocontract_strides; + right_nocontract_t m_right_nocontract_strides; + + Index m_i_size; + Index m_j_size; + Index m_k_size; + + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; + const Device& m_device; + Scalar* m_result; +}; + + +// evaluator for default device +template +struct TensorEvaluator, Device> : + public TensorContractionEvaluatorBase< + TensorEvaluator, Device> > { + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + // Could we use NumDimensions here? + typedef DSizes Dimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) { } + + template + EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + return; + } + + this->template evalGemm(buffer); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h new file mode 100644 index 00000000..0728841c --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -0,0 +1,56 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H + + +namespace Eigen { +namespace internal { + +enum { + ShardByRow = 0, + ShardByCol = 1 +}; + + +// Default Blocking Strategy +template +class TensorContractionBlocking { + public: + + typedef typename LhsMapper::Scalar LhsScalar; + typedef typename RhsMapper::Scalar RhsScalar; + + EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : + kc_(k), mc_(m), nc_(n) + { + if (ShardingType == ShardByCol) { + computeProductBlockingSizes(kc_, mc_, nc_, num_threads); + } + else { + computeProductBlockingSizes(kc_, nc_, mc_, num_threads); + } + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } + + private: + Index kc_; + Index mc_; + Index nc_; +}; + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h new file mode 100644 index 00000000..4fcc60b9 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -0,0 +1,1391 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014-2015 Benoit Steiner +// Copyright (C) 2015 Navdeep Jaitly +// Copyright (C) 2014 Eric Martin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +namespace Eigen { + +template +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. + + // storage indices + const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; + const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // threadIdx.x: the vertical position in an 8x8 block + // threadIdx.y: the vertical index of the 8x8 block in the grid + // threadIdx.z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but now that loop is unrolled in the below code. + + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = conv(0); \ + lhs_pf1 = conv(0); \ + lhs_pf2 = conv(0); \ + lhs_pf3 = conv(0); \ + lhs_pf4 = conv(0); \ + lhs_pf5 = conv(0); \ + lhs_pf6 = conv(0); \ + lhs_pf7 = conv(0); \ + \ + rhs_pf0 = conv(0); \ + rhs_pf1 = conv(0); \ + rhs_pf2 = conv(0); \ + rhs_pf3 = conv(0); \ + rhs_pf4 = conv(0); \ + rhs_pf5 = conv(0); \ + rhs_pf6 = conv(0); \ + rhs_pf7 = conv(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = conv(0); \ + Scalar res(i, 1) = conv(0); \ + Scalar res(i, 2) = conv(0); \ + Scalar res(i, 3) = conv(0); \ + Scalar res(i, 4) = conv(0); \ + Scalar res(i, 5) = conv(0); \ + Scalar res(i, 6) = conv(0); \ + Scalar res(i, 7) = conv(0); \ + + internal::scalar_cast_op conv; + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. Despite common sense, + // the code is a bit faster with this here then at bottom of loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). + +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; + const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef computePass + } // end loop over k + + // we've now iterated over all of the large (ie width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation. +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. There's 2 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (threadIdx.x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + + if (threadIdx.x < max_i_write) { + if (max_j_write == 8) { + // TODO: can i trade bank conflicts for coalesced writes? + Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res +} + + +template +__global__ void +__launch_bounds__(512) +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ Scalar lhs_shmem[72 * 64]; + __shared__ Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + + +template +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i=0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + + Index lhs_vert = base_m+threadIdx.x*4; + + for (Index k = 0; k < k_size; k += 16) { + lhs_pf0 = internal::pset1(0); + rhs_pf0 = internal::pset1(0); + + Index lhs_horiz = threadIdx.y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(threadIdx.x%4)*4; + Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((threadIdx.x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + if((threadIdx.x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... + + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; + + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = threadIdx.y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + + Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1(0); + lhs_pf1 = internal::pset1(0); + lhs_pf2 = internal::pset1(0); + lhs_pf3 = internal::pset1(0); + + rhs_pf0 = internal::pset1(0); + rhs_pf1 = internal::pset1(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+threadIdx.x*4; + Index rhs_horiz0 = threadIdx.y*2+base_n; + Index rhs_horiz1 = threadIdx.y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+threadIdx.x*4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+threadIdx.x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; + + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. + int start_feature = (threadIdx.y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + + __syncthreads(); + Index horiz_base = (threadIdx.y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + typedef float2 LHS_MEM16x16[32][16]; + typedef float2 RHS_MEM16x16[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + + +template +struct TensorEvaluator, GpuDevice> : + public TensorContractionEvaluatorBase, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + } + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_CUDA_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + }; + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + } + }; + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); + LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and __CUDACC__ +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h new file mode 100644 index 00000000..4f31a3cb --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -0,0 +1,469 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H + +namespace Eigen { + +namespace internal { + +enum { + Rhs = 0, + Lhs = 1 +}; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. + */ + +template struct CoeffLoader { + enum { + DirectOffsets = false + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) { + eigen_assert(false && "unsupported"); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename Tensor::PacketReturnType packet(typename Tensor::Index index) const + { + return m_tensor.template packet(index); + } + + + private: + const Tensor m_tensor; +}; + +template struct CoeffLoader { + enum { + DirectOffsets = true + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_data += offset; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename Tensor::PacketReturnType packet(typename Tensor::Index index) const + { + return internal::ploadt_ro(m_data + index); + } + private: + typedef typename Tensor::Scalar Scalar; + const Scalar* m_data; +}; + +template +class SimpleTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC + SimpleTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) { } + + enum { + DirectOffsets = CoeffLoader::DirectOffsets + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_tensor.offsetBuffer(offset); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: xxxps://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 + Index nocontract_val = left ? row : col; + Index linidx = 0; + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? col : row; + if(array_size::value > 0) { + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + } + + return linidx; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { + const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: xxxps://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 + Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; + Index linidx[2] = {0, 0}; + if (array_size::value > array_size::value) { + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? col : row + distance}; + if (array_size::value> 0) { + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + } + return IndexPair(linidx[0], linidx[1]); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { + // Only claim alignment when we can compute the actual stride (ie when we're + // dealing with the lhs with inner_dim_contiguous. This is because the + // matrix-vector product relies on the stride when dealing with aligned inputs. + return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { + return ((side == Lhs) && inner_dim_contiguous && array_size::value > 0) ? m_contract_strides[0] : 1; + } + + protected: + CoeffLoader m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + + +template +class BaseTensorContractionMapper : public SimpleTensorContractionMapper +{ + public: + typedef SimpleTensorContractionMapper ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + typedef typename Tensor::PacketReturnType Packet; + typedef typename unpacket_traits::half HalfPacket; + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); + return this->m_tensor.template packet(index); + } + + const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index last = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. + if (Tensor::PacketAccess && + (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && + (last - first) == (packet_size - 1)) { + + return this->m_tensor.template packet(first); + } + + EIGEN_ALIGN_MAX Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(last); + + return pload(data); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + const Index half_packet_size = unpacket_traits::size; + if (half_packet_size == packet_size) { + return loadPacket(i, j); + } + EIGEN_ALIGN_MAX Scalar data[half_packet_size]; + for (Index k = 0; k < half_packet_size; k++) { + data[k] = operator()(i + k, j); + } + return pload(data); + } +}; + + +template +class BaseTensorContractionMapper : public SimpleTensorContractionMapper +{ + public: + typedef SimpleTensorContractionMapper ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + typedef typename Tensor::PacketReturnType Packet; + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_ALIGN_MAX Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload(data); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { + return loadPacket(i, j); + } +}; + + +template +class TensorContractionSubMapper { + public: + typedef typename Tensor::PacketReturnType Packet; + typedef typename unpacket_traits::half HalfPacket; + + typedef BaseTensorContractionMapper ParentMapper; + typedef TensorContractionSubMapper Self; + typedef Self LinearMapper; + + enum { + // We can use direct offsets iff the parent mapper supports then and we can compute the strides. + // TODO: we should also enable direct offsets for the Rhs case. + UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size::value > 0) + }; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { + // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute + // this offset every time we attempt to access a coefficient. + if (UseDirectOffsets) { + Index stride = m_base_mapper.stride(); + m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride); + } + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper(i, 0); + } + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper(i, j); + } + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadHalfPacket(i, 0); + } + return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + if (UseDirectOffsets) { + m_base_mapper.storePacket(i, 0, p); + } + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + if (UseDirectOffsets) { + return LinearMapper(m_base_mapper, i, j); + } + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { + return false; + } + + private: + ParentMapper m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + +template +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef Scalar_ Scalar; + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; + + EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } +}; + + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h new file mode 100644 index 00000000..bbc0c141 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -0,0 +1,1043 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H + +// evaluator for thread pool device +#ifdef EIGEN_USE_THREADS + +namespace Eigen { + +#ifdef EIGEN_USE_SIMPLE_THREAD_POOL +namespace internal { + +template +struct packLhsArg { + LhsScalar* blockA; + const LhsMapper& lhs; + const Index m_start; + const Index k_start; + const Index mc; + const Index kc; +}; + +template +struct packRhsAndKernelArg { + const MaxSizeVector* blockAs; + RhsScalar* blockB; + const RhsMapper& rhs; + OutputMapper& output; + const Index m; + const Index k; + const Index n; + const Index mc; + const Index kc; + const Index nc; + const Index num_threads; + const Index num_blockAs; + const Index max_m; + const Index k_block_idx; + const Index m_block_idx; + const Index n_block_idx; + const Index m_blocks; + const Index n_blocks; + MaxSizeVector* kernel_notifications; + const MaxSizeVector* lhs_notifications; + const bool need_to_pack; +}; + +} // end namespace internal +#endif // EIGEN_USE_SIMPLE_THREAD_POOL + +template +struct TensorEvaluator, ThreadPoolDevice> : + public TensorContractionEvaluatorBase, ThreadPoolDevice> > { + + typedef ThreadPoolDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::gebp_traits Traits; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + +#ifndef EIGEN_USE_SIMPLE_THREAD_POOL + template + void evalProduct(Scalar* buffer) const { + typedef internal::TensorContractionInputMapper< + LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, + contract_t, internal::packet_traits::size, + lhs_inner_dim_contiguous, false, Unaligned> + LhsMapper; + typedef internal::TensorContractionInputMapper< + RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, + contract_t, internal::packet_traits::size, + rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> + RhsMapper; + typedef internal::blas_data_mapper OutputMapper; + typedef internal::gemm_pack_lhs + LhsPacker; + typedef internal::gemm_pack_rhs< + RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> + RhsPacker; + typedef internal::gebp_kernel + GebpKernel; + + const Index m = this->m_i_size; + const Index n = this->m_j_size; + const Index k = this->m_k_size; + if (m == 0 || n == 0 || k == 0) return; + + // Compute a set of algorithm parameters: + // - kernel block sizes (bm, bn, bk) + // - task grain sizes (number of kernels executed per task: gm, gn) + // - number of threads + // - sharding by row/column + // - parallel packing or first lhs then rhs + // and some derived parameters: + // - number of tasks (nm, nn, nk) + // - number of kernels (nm0, nn0) + // Unfortunately, all these parameters are tightly interdependent. + // So in some cases we first compute approximate values, then compute other + // values based on these approximations and then refine the approximations. + + // There are lots of heuristics here. There is some reasoning behind them, + // but ultimately they are just tuned on contraction benchmarks for + // different input configurations, thread counts and instruction sets. + // So feel free to question any of them. + + // Compute whether we want to shard by row or by column. + // This is a first approximation, it will be refined later. Since we don't + // know number of threads yet we use 2, because what's we are most + // interested in at this point is whether it makes sense to use + // parallelization at all or not. + bool shard_by_col = shardByCol(m, n, 2); + + // First approximation of kernel blocking sizes. + // Again, we don't know number of threads yet, so we use 2. + Index bm, bn, bk; + if (shard_by_col) { + internal::TensorContractionBlocking + blocking(k, m, n, 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking + blocking(k, m, n, 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Compute optimal number of threads. + // Note: we use bk instead of k here because we are interested in amount of + // _parallelizable_ computations, and computations are not parallelizable + // across k dimension. + const TensorOpCost cost = + contractionCost(m, n, bm, bn, bk, shard_by_col, false); + int num_threads = TensorCostModel::numThreads( + static_cast(n) * m, cost, this->m_device.numThreads()); + + // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost + // model is not tuned. Remove this when the cost model is tuned. + if (n == 1) num_threads = 1; + + if (num_threads == 1) { + // The single-threaded algorithm should be faster in this case. + if (n == 1) + this->template evalGemv(buffer); + else + this->template evalGemm(buffer); + return; + } + + // Now that we know number of threads, recalculate sharding and blocking. + shard_by_col = shardByCol(m, n, num_threads); + if (shard_by_col) { + internal::TensorContractionBlocking + blocking(k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking + blocking(k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Number of kernels for each dimension. + Index nm0 = divup(m, bm); + Index nn0 = divup(n, bn); + Index nk = divup(k, bk); + + // Calculate task grain size (number of kernels executed per task). + // This task size coarsening serves two purposes: + // 1. It reduces per-task overheads including synchronization overheads. + // 2. It allows to use caches better (reuse the same packed rhs in several + // consecutive kernels). + Index gm = 1; + Index gn = 1; + // If we are sharding by column, then we prefer to reduce rows first. + if (shard_by_col) { + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + } else { + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + } + // Number of tasks in each dimension. + Index nm = divup(nm0, gm); + Index nn = divup(nn0, gn); + + // Last by not least, decide whether we want to issue both lhs and rhs + // packing in parallel; or issue lhs packing first, and then issue rhs + // packing when lhs packing completes (for !shard_by_col lhs and rhs are + // swapped). Parallel packing allows more parallelism (for both packing and + // kernels), while sequential packing provides better locality (once + // a thread finishes rhs packing it proceed to kernels with that rhs). + // First, we are interested in parallel packing if there are few tasks. + bool parallel_pack = num_threads >= nm * nn; + // Also do parallel packing if all data fits into L2$. + if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <= + l2CacheSize() * num_threads) + parallel_pack = true; + // But don't do it if we will use each rhs only once. Locality seems to be + // more important in this case. + if ((shard_by_col ? nm : nn) == 1) parallel_pack = false; + + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, + this->m_i_strides, this->m_left_contracting_strides, + this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, + this->m_j_strides, this->m_right_contracting_strides, + this->m_k_strides); + + Context(this->m_device, num_threads, lhs, rhs, buffer, m, n, + k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, + shard_by_col, parallel_pack) + .run(); + } + + // Context coordinates a single parallel gemm operation. + template + class Context { + public: + Context(const Device& device, int num_threads, LhsMapper& lhs, + RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, + Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, + Index gn, Index nm0, Index nn0, bool shard_by_col, + bool parallel_pack) + : device_(device), + lhs_(lhs), + rhs_(rhs), + buffer_(buffer), + output_(buffer, tm), + num_threads_(num_threads), + shard_by_col_(shard_by_col), + parallel_pack_(parallel_pack), + m_(tm), + n_(tn), + k_(tk), + bm_(bm), + bn_(bn), + bk_(bk), + nm_(nm), + nn_(nn), + nk_(nk), + gm_(gm), + gn_(gn), + nm0_(nm0), + nn0_(nn0) + { + for (Index x = 0; x < P; x++) { + // Normal number of notifications for k slice switch is + // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only + // nm_ + nn_ notifications, because they will not receive notifications + // from preceeding kernels. + state_switch_[x] = + x == 0 + ? 1 + : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) + + (x == P - 1 ? nm_ * nn_ : 0); + state_packing_ready_[x] = + parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_); + state_kernel_[x] = new std::atomic*[nm_]; + for (Index m = 0; m < nm_; m++) { + state_kernel_[x][m] = new std::atomic[nn_]; + // Kernels generally receive 3 notifications (previous kernel + 2 + // packing), but the first slice won't get notifications from previous + // kernels. + for (Index n = 0; n < nn_; n++) + state_kernel_[x][m][n].store( + (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1), + std::memory_order_relaxed); + } + } + + // Allocate memory for packed rhs/lhs matrices. + size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + size_t lhs_size = + divup(bm_ * bk_ * sizeof(LhsScalar), align) * align; + size_t rhs_size = + divup(bn_ * bk_ * sizeof(RhsScalar), align) * align; + packed_mem_ = static_cast(internal::aligned_malloc( + (nm0_ * lhs_size + nn0_ * rhs_size) * std::min(nk_, P - 1))); + char* mem = static_cast(packed_mem_); + for (Index x = 0; x < numext::mini(nk_, P - 1); x++) { + packed_lhs_[x].resize(nm0_); + for (Index m = 0; m < nm0_; m++) { + packed_lhs_[x][m] = reinterpret_cast(mem); + mem += lhs_size; + } + packed_rhs_[x].resize(nn0_); + for (Index n = 0; n < nn0_; n++) { + packed_rhs_[x][n] = reinterpret_cast(mem); + mem += rhs_size; + } + } + } + + ~Context() { + for (Index x = 0; x < P; x++) { + for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; + delete[] state_kernel_[x]; + } + internal::aligned_free(packed_mem_); + } + + void run() { + // Kick off packing of the first slice. + signal_switch(0, 1); + // Wait for overall completion. + // TODO(dvyukov): this wait can lead to deadlock. + // If nthreads contractions are concurrently submitted from worker + // threads, this wait will block all worker threads and the system will + // deadlock. + done_.Wait(); + } + + private: + Notification done_; + const Device& device_; + LhsMapper& lhs_; + RhsMapper& rhs_; + Scalar* const buffer_; + OutputMapper output_; + const int num_threads_; + const bool shard_by_col_; + const bool parallel_pack_; + // Matrix sizes. + const Index m_; + const Index n_; + const Index k_; + // Block sizes. + const Index bm_; + const Index bn_; + const Index bk_; + // Number of tasks. + const Index nm_; + const Index nn_; + const Index nk_; + // Task grain sizes (number of kernels executed per task). + const Index gm_; + const Index gn_; + // Number of blocks (this is different from ni_/nn_ because of task size + // coarsening). + const Index nm0_; + const Index nn0_; + + // Parallelization strategy. + // + // Blocks related to the same k block can run in parallel because they write + // to different output blocks. So we parallelize within k slices, this + // gives us parallelism level of m x n. Before we can start any kernels + // related to k-th slice, we need to issue m lhs packing tasks and n rhs + // packing tasks. + // + // However, there is a bottleneck when we are finishing kernels for k-th + // slice (at the very end there is only 1 runnable kernel). To mitigate this + // bottleneck we allow kernels from k-th and k+1-th slices to run in + // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same + // output block, so they must not run in parallel. + // + // This gives us the following dependency graph. + // On each k slice we have m x n kernel tasks, m lhs paking tasks and n rhs + // packing tasks. + // Kernel (m, n, k) can start when: + // - kernel (m, n, k-1) has finished + // - lhs packing (m, k) has finished + // - rhs packing (n, k) has finished + // Lhs/rhs packing can start when: + // - all k-1 packing has finished (artificially imposed to limit amount of + // parallel packing) + // + // On top of that we limit runnable tasks to two consecutive k slices. + // This is done to limit amount of memory we need for packed lhs/rhs + // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_). + // + // state_switch_ tracks when we are ready to switch to the next k slice. + // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n). + // These variable are rolling over 3 consecutive k slices: first two we are + // actively executing + one to track completion of kernels in the second + // slice. + static const Index P = 3; + void* packed_mem_; + std::vector packed_lhs_[P - 1]; + std::vector packed_rhs_[P - 1]; + std::atomic** state_kernel_[P]; + // state_switch_ is frequently modified by worker threads, while other + // fields are read-only after constructor. Let's move it to a separate cache + // line to reduce cache-coherency traffic. + char pad_[128]; + std::atomic state_packing_ready_[P]; + std::atomic state_switch_[P]; + + void pack_lhs(Index m, Index k) { + const Index mend = m * gm_ + gm(m); + for (Index m1 = m * gm_; m1 < mend; m1++) + LhsPacker()(packed_lhs_[k % (P - 1)][m1], + lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); + + if (!parallel_pack_ && shard_by_col_) { + signal_packing(k); + } else { + signal_switch(k + 1); + for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0); + } + } + + void pack_rhs(Index n, Index k) { + const Index nend = n * gn_ + gn(n); + for (Index n1 = n * gn_; n1 < nend; n1++) { + if (k == 0) { + // Zero the output memory in parallel. + // On 10000x2x10000 mm zeroing can easily take half of time. + // Zero (bn x m) row. Safe to do here because all kernels that will + // write to this memory depend on completion of this task. + // Note: don't call device_.memset() here. device_.memset() blocks on + // thread pool worker thread, which can lead to underutilization and + // deadlocks. + memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar)); + } + RhsPacker()(packed_rhs_[k % (P - 1)][n1], + rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); + } + + if (parallel_pack_ || shard_by_col_) { + signal_switch(k + 1); + for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0); + } else { + signal_packing(k); + } + } + + void kernel(Index m, Index n, Index k) { + // Note: order of iteration matters here. Iteration over m is innermost + // because we want to reuse the same packed rhs in consequetive tasks + // (rhs fits into L2$ while lhs only into L3$). + const Index nend = n * gn_ + gn(n); + const Index mend = m * gm_ + gm(m); + if (shard_by_col_) { + for (Index n1 = n * gn_; n1 < nend; n1++) { + for (Index m1 = m * gm_; m1 < mend; m1++) + GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), + packed_lhs_[k % (P - 1)][m1], + packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), + Scalar(1), -1, -1, 0, 0); + } + } else { + for (Index m1 = m * gm_; m1 < mend; m1++) + for (Index n1 = n * gn_; n1 < nend; n1++) { + GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), + packed_lhs_[k % (P - 1)][m1], + packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), + Scalar(1), -1, -1, 0, 0); + } + } + signal_kernel(m, n, k + 1, false); + signal_switch(k + 2); + } + + void signal_packing(Index k) { + eigen_assert(!parallel_pack_); + Index s = state_packing_ready_[k % P].fetch_sub(1); + eigen_assert(s > 0); + if (s != 1) return; + state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_; + enqueue_packing(k, shard_by_col_); + } + + void signal_kernel(Index m, Index n, Index k, bool sync) { + std::atomic* state = &state_kernel_[k % P][m][n]; + Index s = state->load(); + eigen_assert(s > 0); + if (s != 1 && state->fetch_sub(1) != 1) return; + state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); + if (sync) + kernel(m, n, k); + else + device_.enqueueNoNotification([=]() { kernel(m, n, k); }); + } + + void signal_switch(Index k, Index v = 1) { + Index s = state_switch_[k % P].fetch_sub(v); + eigen_assert(s >= v); + if (s != v) return; + + // Ready to switch to the next k slice. + // Reset counter for the next iteration. + state_switch_[k % P] = + (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) + + nm_ * nn_; + if (k < nk_) { + // Issue lhs/rhs packing. Their completion will in turn kick off + // kernels. + if (parallel_pack_) { + enqueue_packing(k, !shard_by_col_); + enqueue_packing(k, shard_by_col_); + } else if (shard_by_col_) { + enqueue_packing(k, false); + } else { + enqueue_packing(k, true); + } + + // Termination handling. + // Because kernel completion signals k + 2 switch, we need to finish nk + // + 2 slices without issuing any tasks on nk + 1 slice. So here we + // pretend that all nk + 1 packing tasks just finish instantly; so that + // nk + 2 switch only waits for completion of nk kernels. + } else if (k == nk_) { + signal_switch(k + 1, + parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)); + } else { + done_.Notify(); + } + } + + // Enqueue all rhs/lhs packing for k-th slice. + void enqueue_packing(Index k, bool rhs) { + enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs); + } + + void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) { + if (end - start == 1) { + if (rhs) + pack_rhs(start, k); + else + pack_lhs(start, k); + } else { + Index mid = (start + end) / 2; + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(mid, end, k, rhs); }); + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(start, mid, k, rhs); }); + } + } + + // Block sizes with accounting for potentially incomplete last block. + Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; } + Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; } + Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; } + // Task grain sizes accounting for potentially incomplete last task. + Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; } + Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; } + + Context(const Context&) = delete; + void operator=(const Context&) = delete; + }; + + // Decide whether we want to shard m x n contraction by columns or by rows. + static bool shardByCol(Index m, Index n, Index num_threads) { + // Note: we are comparing both n and m against Traits::nr, it is not + // a mistake. We are trying to figure out how both n and m will fit into + // the main sharding dimension. + + // Sharding by column is the default + // ... unless there is enough data for vectorization over rows + if (m / num_threads >= Traits::nr && + // and not enough data for vectorization over columns + (n / num_threads < Traits::nr || + // ... or barely enough data for vectorization over columns, + // but it is not evenly dividable across threads + (n / num_threads < 4 * Traits::nr && + (n % (num_threads * Traits::nr)) != 0 && + // ... and it is evenly dividable across threads for rows + ((m % (num_threads * Traits::nr)) == 0 || + // .. or it is not evenly dividable for both dimensions but + // there is much more data over rows so that corner effects are + // mitigated. + (m / n >= 6))))) + return false; + // Wait, or if matrices are just substantially prolonged over the other + // dimension. + if (n / num_threads < 16 * Traits::nr && m > n * 32) return false; + return true; + } + + Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn, + int num_threads, bool shard_by_col) const { + Index gm = 1; + Index gm1 = 1; + Index nm0 = divup(m, bm); + Index nm1 = nm0; + for (;;) { + // Find the next candidate for m grain size. It needs to result in + // different number of blocks. E.g. if we have 10 kernels, we want to try + // 5 and 10, but not 6, 7, 8 and 9. + while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++; + if (gm1 > nm0) break; + // Check the candidate. + int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, + shard_by_col); + if (res < 0) break; + nm1 = divup(nm0, gm1); + if (res == 0) continue; + // Commit new grain size. + gm = gm1; + } + return gm; + } + + Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm, + int num_threads, bool shard_by_col) const { + Index gn = 1; + Index gn1 = 1; + Index nn0 = divup(n, bn); + Index nn1 = nn0; + for (;;) { + while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++; + if (gn1 > nn0) break; + int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, + shard_by_col); + if (res < 0) break; + nn1 = divup(nn0, gn1); + if (res == 0) continue; + gn = gn1; + } + return gn; + } + + // checkGrain checks whether grain (gm, gn) is suitable and is better than + // (oldgm, oldgn). + int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm, + Index gn, Index oldgm, Index oldgn, int num_threads, + bool shard_by_col) const { + const TensorOpCost cost = + contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true); + double taskSize = TensorCostModel::taskSize( + static_cast(bm) * gm * bn * gn, cost); + // If the task is too small, then we agree on it regardless of anything + // else. Otherwise synchronization overheads will dominate. + if (taskSize < 1) return 1; + // If it is too large, then we reject it and all larger tasks. + if (taskSize > 2) return -1; + // Now we are in presumably good task size range. + // The main deciding factor here is parallelism. Consider that we have 12 + // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes. + // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4 + // of cores will be busy). While grain size 3 gives us 4 tasks, which gives + // us parallelism of 1 (we can load all cores). + Index nm0 = divup(m, bm); + Index nn0 = divup(n, bn); + Index new_tasks = divup(nm0, gm) * divup(nn0, gn); + double new_parallelism = static_cast(new_tasks) / + (divup(new_tasks, num_threads) * num_threads); + Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn); + double old_parallelism = static_cast(old_tasks) / + (divup(old_tasks, num_threads) * num_threads); + if (new_parallelism > old_parallelism || new_parallelism == 1) return 1; + return 0; + } + +#else // EIGEN_USE_SIMPLE_THREAD_POOL + + template + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + return; + } + + evalGemm(buffer); + } + + template + void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + + const int lhs_packet_size = internal::unpacket_traits::size; + const int rhs_packet_size = internal::unpacket_traits::size; + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + // TODO: packing could be faster sometimes if we supported row major tensor mappers + typedef internal::gemm_pack_lhs LhsPacker; + typedef internal::gemm_pack_rhs RhsPacker; + + // TODO: replace false, false with conjugate values? + typedef internal::gebp_kernel GebpKernel; + + typedef internal::packLhsArg packLArg; + typedef internal::packRhsAndKernelArg packRKArg; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // compute block sizes (which depend on number of threads) + const Index num_threads = this->m_device.numThreads(); + internal::TensorContractionBlocking blocking(k, m, n, num_threads); + Index mc = blocking.mc(); + Index nc = blocking.nc(); + Index kc = blocking.kc(); + eigen_assert(mc <= m); + eigen_assert(nc <= n); + eigen_assert(kc <= k); + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + const Index k_blocks = CEIL_DIV(k, kc); + const Index n_blocks = CEIL_DIV(n, nc); + const Index m_blocks = CEIL_DIV(m, mc); + const Index sizeA = mc * kc; + const Index sizeB = kc * nc; + + /* cout << "m: " << m << " n: " << n << " k: " << k << endl; + cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; + cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; + cout << "num threads: " << num_threads << endl; + */ + + // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB + // aren't 16 byte aligned segfaults will happen due to SIMD instructions + // note: You can get away with allocating just a single blockA and offsets and meet the + // the alignment requirements with the assumption that + // (Traits::mr * sizeof(ResScalar)) % 16 == 0 + const Index numBlockAs = numext::mini(num_threads, m_blocks); + MaxSizeVector blockAs(num_threads); + for (int i = 0; i < num_threads; i++) { + blockAs.push_back(static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); + } + + // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread + // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. + // Other options: (1) reuse memory when a thread finishes. con: tricky + // (2) allocate block B memory in each thread. con: overhead + MaxSizeVector blockBs(n_blocks); + for (int i = 0; i < n_blocks; i++) { + blockBs.push_back(static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); + } + + // lhs_notifications starts with all null Notifications + MaxSizeVector lhs_notifications(num_threads, nullptr); + + // this should really be numBlockAs * n_blocks; + const Index num_kernel_notifications = num_threads * n_blocks; + MaxSizeVector kernel_notifications(num_kernel_notifications, + nullptr); + + for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { + const Index k_start = k_block_idx * kc; + // make sure we don't overshoot right edge of left matrix + const Index actual_kc = numext::mini(k_start + kc, k) - k_start; + + for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { + const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs); + + for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { + const Index m_start = mt_block_idx * mc; + const Index actual_mc = numext::mini(m_start + mc, m) - m_start; + eigen_assert(actual_mc > 0); + + Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; + + for (int i = 0; i < n_blocks; ++i) { + Index notification_id = (blockAId * n_blocks + i); + // Wait for any current kernels using this slot to complete + // before using it. + if (kernel_notifications[notification_id]) { + wait_until_ready(kernel_notifications[notification_id]); + delete kernel_notifications[notification_id]; + } + kernel_notifications[notification_id] = new Notification(); + } + const packLArg arg = { + blockAs[blockAId], // blockA + lhs, // lhs + m_start, // m + k_start, // k + actual_mc, // mc + actual_kc, // kc + }; + + // Delete any existing notification since we may be + // replacing it. The algorithm should ensure that there are + // no existing waiters on this notification. + delete lhs_notifications[blockAId]; + lhs_notifications[blockAId] = + this->m_device.enqueue(&Self::packLhs, arg); + } + + // now start kernels. + const Index m_base_start = m_block_idx * mc; + const bool need_to_pack = m_block_idx == 0; + + for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { + const Index n_start = n_block_idx * nc; + const Index actual_nc = numext::mini(n_start + nc, n) - n_start; + + // first make sure the previous kernels are all done before overwriting rhs. Also wait if + // we're going to start new k. In both cases need_to_pack is true. + if (need_to_pack) { + for (Index i = num_blocks; i < num_threads; ++i) { + Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; + Index future_id = (blockAId * n_blocks + n_block_idx); + wait_until_ready(kernel_notifications[future_id]); + } + } + + packRKArg arg = { + &blockAs, // blockA + blockBs[n_block_idx], // blockB + rhs, // rhs + output, // output + m_base_start, // m + k_start, // k + n_start, // n + mc, // mc + actual_kc, // kc + actual_nc, // nc + num_threads, + numBlockAs, + m, + k_block_idx, + m_block_idx, + n_block_idx, // n_block_idx + m_blocks, // m_blocks + n_blocks, // n_blocks + &kernel_notifications, // kernel notifications + &lhs_notifications, // lhs notifications + need_to_pack, // need_to_pack + }; + + // We asynchronously kick off this function, which ends up + // notifying the appropriate kernel_notifications objects, + // which this thread waits on before exiting. + this->m_device.enqueueNoNotification(&Self::packRhsAndKernel, arg); + } + } + } + + // Make sure all the kernels are done. + for (size_t i = 0; i < kernel_notifications.size(); ++i) { + wait_until_ready(kernel_notifications[i]); + delete kernel_notifications[i]; + } + + // No need to wait for lhs notifications since they should have + // already been waited on. Just clean them up. + for (size_t i = 0; i < lhs_notifications.size(); ++i) { + delete lhs_notifications[i]; + } + + // deallocate all of the memory for both A and B's + for (size_t i = 0; i < blockAs.size(); i++) { + this->m_device.deallocate(blockAs[i]); + } + for (size_t i = 0; i < blockBs.size(); i++) { + this->m_device.deallocate(blockBs[i]); + } + +#undef CEIL_DIV + } + + /* + * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing + * the LHS block, check that all of the kernels that worked on the same + * mt_block_idx in the previous m_block are done. + */ + template + static void packLhs(const packLArg arg) { + // perform actual packing + LhsPacker pack_lhs; + pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); + } + + /* + * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that + * all kernels in the previous block are done. + * Then for each LHS future, we wait on the future and then call GEBP + * on the area packed by the future (which starts at + * blockA + future_idx * mt * kc) on the LHS and with the full packed + * RHS block. + * The output of this GEBP is written to output(m + i * mt, n). + */ + template + static void packRhsAndKernel(packRKArg arg) { + if (arg.need_to_pack) { + RhsPacker pack_rhs; + pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); + } + + GebpKernel gebp; + for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { + const Index m_base_start = arg.m + arg.mc*mt_block_idx; + if (m_base_start < arg.max_m) { + Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; + wait_until_ready((*arg.lhs_notifications)[blockAId]); + const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start; + gebp(arg.output.getSubMapper(m_base_start, arg.n), + (*arg.blockAs)[blockAId], arg.blockB, + actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0); + + // Notify that the kernel is done. + const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; + (*arg.kernel_notifications)[set_idx]->Notify(); + } + } + } +#endif // EIGEN_USE_SIMPLE_THREAD_POOL + + TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, + bool shard_by_col, bool prepacked) const { + const int packed_size = std::min(PacketType::size, + PacketType::size); + const int output_packet_size = internal::unpacket_traits::size; + const double kd = static_cast(bk); + // Peak VFMA bandwidth is 0.5. However if we have not enough data for + // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined + // experimentally. + double computeBandwidth = bk == 1 ? 4.0 : + (shard_by_col ? bn : bm) < Traits::nr || + (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5; +#ifndef EIGEN_VECTORIZE_FMA + // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. + // However for MULPS/ADDPS we have dependent sequence of 2 such instructions, + // so overall bandwidth is 1.0. + if (computeBandwidth == 0.5) computeBandwidth = 1.0; +#endif + // Computations. + TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size); + // Output stores. + cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + if (prepacked) { + // Packing and kernels are executed in different tasks. When we calculate + // task grain size we look only at kernel cost assuming that kernel + // is more expensive than packing. + return cost; + } + // Lhs/rhs loads + computations. + TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n); + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m); + // Lhs packing memory cost does not contribute considerably to overall + // execution time because lhs is prefetched early and accessed sequentially. + if (shard_by_col) + lhsCost.dropMemoryCost(); + else + rhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_THREADS +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h new file mode 100644 index 00000000..7c33cc5d --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -0,0 +1,279 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H + +namespace Eigen { + +/** \class TensorConversionOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor conversion class. This class makes it possible to vectorize + * type casting operations when the number of scalars per packet in the source + * and the destination type differ + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef TargetType Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConversionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConversionOp type; +}; + +} // end namespace internal + + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + return internal::pcast(m_impl.template packet(index)); + } + + private: + const TensorEvaluator& m_impl; +}; + + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); + SrcPacket src3 = m_impl.template packet(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2, src3, src4); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + // Only call m_impl.packet() when we have direct access to the underlying data. This + // ensures that we don't compute the subexpression twice. We may however load some + // coefficients twice, but in practice this doesn't negatively impact performance. + if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) { + // Force unaligned memory loads since we can't ensure alignment anymore + return internal::pcast(m_impl.template packet(index)); + } else { + const int TgtPacketSize = internal::unpacket_traits::size; + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TgtType; + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; + for (int i = 0; i < TgtPacketSize; ++i) { + values[i] = converter(m_impl.coeff(index+i)); + } + TgtPacket rslt = internal::pload(values); + return rslt; + } + } + + private: + const TensorEvaluator& m_impl; + const typename TensorEvaluator::Index m_maxIndex; +}; + +template +class TensorConversionOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::nested::type Nested; + typedef Scalar CoeffReturnType; + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) + : m_xpr(xpr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +template struct ConversionSubExprEval { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) { + impl.evalSubExprsIfNeeded(NULL); + return true; + } +}; + +template struct ConversionSubExprEval { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) { + return impl.evalSubExprsIfNeeded(data); + } +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorConversionOp XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef TargetType Scalar; + typedef TargetType CoeffReturnType; + typedef typename internal::remove_all::Scalar>::type SrcType; + typedef typename PacketType::type PacketReturnType; + typedef typename PacketType::type PacketSourceType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, + PacketAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) + { + return ConversionSubExprEval::value, TensorEvaluator, Scalar>::run(m_impl, data); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + internal::scalar_cast_op converter; + return converter(m_impl.coeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const bool Vectorizable = TensorEvaluator::PacketAccess & + internal::type_casting_traits::VectorizedCast; + return PacketConv::run(m_impl, index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double cast_cost = TensorOpCost::CastCost(); + if (vectorized) { + const double SrcCoeffRatio = + internal::type_casting_traits::SrcCoeffRatio; + const double TgtCoeffRatio = + internal::type_casting_traits::TgtCoeffRatio; + return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) + + TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize)); + } else { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost); + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + template + struct PacketConv { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = converter(impl.coeff(index+i)); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + }; + + template + struct PacketConv { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { + const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; + PacketConverter, PacketSourceType, PacketReturnType, + SrcCoeffRatio, TgtCoeffRatio> converter(impl); + return converter.template packet(index); + } + }; + + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h new file mode 100644 index 00000000..98d4dc38 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -0,0 +1,1104 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. + * + * + */ +namespace internal { + +template +class IndexMapper { + public: + IndexMapper(const InputDims& input_dims, const array& kernel_dims, + const array& indices) { + + array dimensions = input_dims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = indices[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + dimensions[index] = result_dim; + } + + array inputStrides; + array outputStrides; + if (static_cast(Layout) == static_cast(ColMajor)) { + inputStrides[0] = 1; + outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; + outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; + } + } else { + inputStrides[NumDims - 1] = 1; + outputStrides[NumDims - 1] = 1; + for (int i = static_cast(NumDims) - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; + } + } + + array cudaInputDimensions; + array cudaOutputDimensions; + array tmp = dimensions; + array ordering; + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = i + offset; + ordering[index] = indices[i]; + tmp[indices[i]] = -1; + cudaInputDimensions[index] = input_dims[indices[i]]; + cudaOutputDimensions[index] = dimensions[indices[i]]; + } + + int written = static_cast(Layout) == static_cast(ColMajor) + ? NumKernelDims + : 0; + for (int i = 0; i < NumDims; ++i) { + if (tmp[i] >= 0) { + ordering[written] = i; + cudaInputDimensions[written] = input_dims[i]; + cudaOutputDimensions[written] = dimensions[i]; + ++written; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[ordering[i]]; + m_outputStrides[i] = outputStrides[ordering[i]]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_cudaInputStrides[i] = + m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1]; + m_cudaOutputStrides[i] = + m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (i + 1 < offset) { + m_cudaInputStrides[i] = + m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; + m_cudaOutputStrides[i] = + m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[NumKernelDims]; + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[limit]; + } + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[NumKernelDims]; + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[limit]; + } + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + + k * m_inputStrides[offset + 2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + + k * m_outputStrides[offset + 2]; + } + + private: + static const int NumDims = internal::array_size::value; + array m_inputStrides; + array m_outputStrides; + array m_cudaInputStrides; + array m_cudaOutputStrides; +}; + + + +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type::ret Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConvolutionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConvolutionOp type; +}; + +} // end namespace internal + + + +template +class TensorConvolutionOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& + inputExpression() const { return m_input_xpr; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& + kernelExpression() const { return m_kernel_xpr; } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + + +template +struct TensorEvaluator, Device> +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } + } + + m_dimensions = m_inputImpl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + preloadKernel(); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + CoeffReturnType result = CoeffReturnType(0); + convolve(firstInput(index), 0, NumKernelDims-1, result); + return result; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const + { + Index indices[2] = {index, index+PacketSize-1}; + Index startInputs[2] = {0, 0}; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1]-startInputs[0] == PacketSize-1) { + PacketReturnType result = internal::pset1(0); + convolvePacket(startInputs[0], 0, NumKernelDims-1, result); + return result; + } else { + EIGEN_ALIGN_MAX Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims-1, data[0]); + for (int i = 1; i < PacketSize-1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); + } + data[PacketSize-1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); + return internal::pload(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = + TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } + startInput += index; + return startInput; + } + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolve(input, kernel, DimIndex-1, accum); + } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } + + template + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex-1, accum); + } else { + accum = internal::pmadd(m_inputImpl.template packet(input), internal::pset1(m_kernel[kernel]), accum); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + array m_inputStride; + array m_outputStride; + + array m_indexStride; + array m_kernelStride; + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device& m_device; +}; + + + + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +template +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { + return StaticKernelSize; + } +}; +template <> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { + return kernelSize; + } +}; + +template +__global__ void EigenConvolutionKernel1D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int kernelSize, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = blockIdx.y * blockDim.y; + const int plane_stride = blockDim.y * gridDim.y; + + for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.y * num_x_input; + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; + #pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { + result += s[k + kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + +template +__global__ void EigenConvolutionKernel2D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int numY, const int maxY, const int kernelSizeX, + const int kernelSizeY, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = blockIdx.z * blockDim.z; + const int plane_stride = blockDim.z * gridDim.z; + + for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.z * num_y_input; + + // Load inputs to shared memory + #pragma unroll + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + #pragma unroll + for (int l = 0; l < GetKernelSize()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); + #pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + +template +__global__ void EigenConvolutionKernel3D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const size_t numPlanes, const size_t numX, + const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, + const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, + const size_t kernelSizeZ, float* buffer) { + extern __shared__ float s[]; + + // Load inputs to shared memory + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = blockIdx.z * maxZ; + const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + + + +template +struct TensorEvaluator, GpuDevice> +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = internal::unpacket_traits::size; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator::Dimensions InputDims; + + const int maxSharedMem = m_device.sharedMemPerBlock(); + const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = m_device.getNumCudaMultiProcessors(); + const int warpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + int maxX; + dim3 block_size; + + const int single_stride_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 0 + : m_inputImpl.dimensions().rank() - 1; + if (m_indices[0] == single_stride_dim) { + // Maximum the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = numext::mini(inner_dim, numX); + const int maxP = numext::mini(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = numext::mini(maxThreadsPerBlock, maxX); + block_size.y = numext::mini(maxThreadsPerBlock / block_size.x, maxP); + } + else { + // Read as much as possible alongside the inner most dimension, that is the plane + const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); + const int maxP = numext::mini(inner_dim, numP); + maxX = numext::mini(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = numext::mini(warpSize, maxX); + block_size.y = numext::mini(maxThreadsPerBlock/block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, numext::mini(num_y_blocks, ceil(numP, block_size.y))); + + + //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices(m_indices[0]); + const array kernel_dims(m_kernelImpl.dimensions()[0]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch(kernel_size) { + case 4: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + break; + } + case 7: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + } + } + break; + } + + case 2: { + const int idxX = + static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; + const int idxY = + static_cast(Layout) == static_cast(ColMajor) ? 1 : 0; + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numP = dimensions().TotalSize() / (numX*numY); + + const float scaling_factor = sqrtf(static_cast(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = numext::mini(inner_dim, numX); + const int maxY = numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = numext::mini(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = numext::mini(1024, maxX); + block_size.y = numext::mini(1024/block_size.x, maxY); + block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini(num_z_blocks, ceil(numP, block_size.z))); + + + //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices(m_indices[idxX], m_indices[idxY]); + const array kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + break; + } + + case 3: { + const int idxX = + static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; + const int idxY = + static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; + const int idxZ = + static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; + + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numZ = dimensions()[m_indices[idxZ]]; + const int numP = dimensions().TotalSize() / (numX*numY*numZ); + + const int maxX = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); + const int maxY = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); + const int maxZ = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); + + dim3 block_size; + block_size.x = numext::mini(32, maxX); + block_size.y = numext::mini(32, maxY); + block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + const array indices(m_indices[idxX], m_indices[idxY], + m_indices[idxZ]); + const array kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY], + m_kernelImpl.dimensions()[idxZ]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf+index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = + TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = (const TensorEvaluator&); + + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h new file mode 100644 index 00000000..60cb0458 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -0,0 +1,212 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H +#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief A cost model used to limit the number of threads used for evaluating + * tensor expression. + * + */ + +// Class storing the cost of evaluating a tensor expression in terms of the +// estimated number of operand bytes loads, bytes stored, and compute cycles. +class TensorOpCost { + public: + // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple + // model based on minimal reciprocal throughput numbers from Intel or + // Agner Fog's tables would be better than what is there now. + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() { + return internal::functor_traits< + internal::scalar_product_op >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() { + return internal::functor_traits >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() { + return internal::functor_traits< + internal::scalar_quotient_op >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() { + return internal::functor_traits >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() { + return internal::functor_traits< + internal::scalar_cast_op >::Cost; + } + + EIGEN_DEVICE_FUNC + TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {} + EIGEN_DEVICE_FUNC + TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles) + : bytes_loaded_(bytes_loaded), + bytes_stored_(bytes_stored), + compute_cycles_(compute_cycles) {} + + EIGEN_DEVICE_FUNC + TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, + bool vectorized, double packet_size) + : bytes_loaded_(bytes_loaded), + bytes_stored_(bytes_stored), + compute_cycles_(vectorized ? compute_cycles / packet_size + : compute_cycles) { + eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded)); + eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored)); + eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { + return bytes_loaded_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { + return bytes_stored_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { + return compute_cycles_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost( + double load_cost, double store_cost, double compute_cost) const { + return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + + compute_cost * compute_cycles_; + } + + // Drop memory access component. Intended for cases when memory accesses are + // sequential or are completely masked by computations. + EIGEN_DEVICE_FUNC void dropMemoryCost() { + bytes_loaded_ = 0; + bytes_stored_ = 0; + } + + // TODO(rmlarsen): Define min in terms of total cost, not elementwise. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin( + const TensorOpCost& rhs) const { + double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded()); + double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored()); + double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles()); + return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); + } + + // TODO(rmlarsen): Define max in terms of total cost, not elementwise. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax( + const TensorOpCost& rhs) const { + double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); + double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored()); + double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles()); + return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=( + const TensorOpCost& rhs) { + bytes_loaded_ += rhs.bytes_loaded(); + bytes_stored_ += rhs.bytes_stored(); + compute_cycles_ += rhs.compute_cycles(); + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) { + bytes_loaded_ *= rhs; + bytes_stored_ *= rhs; + compute_cycles_ *= rhs; + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+( + TensorOpCost lhs, const TensorOpCost& rhs) { + lhs += rhs; + return lhs; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( + TensorOpCost lhs, double rhs) { + lhs *= rhs; + return lhs; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( + double lhs, TensorOpCost rhs) { + rhs *= lhs; + return rhs; + } + + friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) { + return os << "[bytes_loaded = " << tc.bytes_loaded() + << ", bytes_stored = " << tc.bytes_stored() + << ", compute_cycles = " << tc.compute_cycles() << "]"; + } + + private: + double bytes_loaded_; + double bytes_stored_; + double compute_cycles_; +}; + +// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads +// in [1:max_threads] instead of just switching multi-threading off for small +// work units. +template +class TensorCostModel { + public: + // Scaling from Eigen compute cost to device cycles. + static const int kDeviceCyclesPerComputeCycle = 1; + + // Costs in device cycles. + static const int kStartupCycles = 100000; + static const int kPerThreadCycles = 100000; + static const int kTaskSize = 40000; + + // Returns the number of threads in [1:max_threads] to use for + // evaluating an expression with the given output size and cost per + // coefficient. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( + double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { + double cost = totalCost(output_size, cost_per_coeff); + int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; + return numext::mini(max_threads, numext::maxi(1, threads)); + } + + // taskSize assesses parallel task size. + // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task + // granularity needs to be increased to mitigate parallelization overheads. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize( + double output_size, const TensorOpCost& cost_per_coeff) { + return totalCost(output_size, cost_per_coeff) / kTaskSize; + } + + private: + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost( + double output_size, const TensorOpCost& cost_per_coeff) { + // Cost of memory fetches from L2 cache. 64 is typical cache line size. + // 11 is L2 cache latency on Haswell. + // We don't know whether data is in L1, L2 or L3. But we are most interested + // in single-threaded computational time around 100us-10ms (smaller time + // is too small for parallelization, larger time is not intersting + // either because we are probably using all available threads already). + // And for the target time range, L2 seems to be what matters. Data set + // fitting into L1 is too small to take noticeable time. Data set fitting + // only into L3 presumably will take more than 10ms to load and process. + const double kLoadCycles = 1.0 / 64 * 11; + const double kStoreCycles = 1.0 / 64 * 11; + // Scaling from Eigen compute cost to device cycles. + return output_size * + cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, + kDeviceCyclesPerComputeCycle); + } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h new file mode 100644 index 00000000..2e44bcf9 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -0,0 +1,313 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H +#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H + +namespace Eigen { + +/** \class TensorCustomUnaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. + * + * + */ +namespace internal { +template +struct traits > +{ + typedef typename XprType::Scalar Scalar; + typedef typename XprType::StorageKind StorageKind; + typedef typename XprType::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCustomUnaryOp& type; +}; + +template +struct nested > +{ + typedef TensorCustomUnaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCustomUnaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func) + : m_expr(expr), m_func(func) {} + + EIGEN_DEVICE_FUNC + const CustomUnaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_expr; } + + protected: + typename XprType::Nested m_expr; + const CustomUnaryFunc m_func; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorCustomUnaryOp ArgType; + typedef typename internal::traits::Index Index; + static const int NumDims = internal::traits::NumDimensions; + typedef DSizes Dimensions; + typedef typename internal::remove_const::type Scalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) + { + m_dimensions = op.func().dimensions(op.expression()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast( + m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + + protected: + EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { + TensorMap > result( + data, m_dimensions); + m_op.func().eval(m_op.expression(), result, m_device); + } + + Dimensions m_dimensions; + const ArgType m_op; + const Device& m_device; + CoeffReturnType* m_result; +}; + + + +/** \class TensorCustomBinaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. + * + * + */ +namespace internal { +template +struct traits > +{ + typedef typename internal::promote_storage_type::ret Scalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCustomBinaryOp& type; +}; + +template +struct nested > +{ + typedef TensorCustomBinaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCustomBinaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::traits::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func) + + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {} + + EIGEN_DEVICE_FUNC + const CustomBinaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const CustomBinaryFunc m_func; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorCustomBinaryOp XprType; + typedef typename internal::traits::Index Index; + static const int NumDims = internal::traits::NumDimensions; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) + { + m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + + protected: + EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { + TensorMap > result(data, m_dimensions); + m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); + } + + Dimensions m_dimensions; + const XprType m_op; + const Device& m_device; + CoeffReturnType* m_result; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h new file mode 100644 index 00000000..fd9ecea9 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -0,0 +1,68 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H + +namespace Eigen { + +/** \class TensorDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its argument + * on the specified computing 'device' (GPU, thread pool, ...) + * + * Example: + * C.device(EIGEN_GPU) = A + B; + * + * Todo: operator *= and /=. + */ + +template class TensorDevice { + public: + TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + template + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + template + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h new file mode 100644 index 00000000..8e587215 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -0,0 +1,337 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H + +namespace Eigen { + +static const int kCudaScratchSize = 1024; + +// This defines an interface that GPUDevice can take to use +// CUDA streams underneath. +class StreamInterface { + public: + virtual ~StreamInterface() {} + + virtual const cudaStream_t& stream() const = 0; + virtual const cudaDeviceProp& deviceProperties() const = 0; + + // Allocate memory on the actual device where the computation will run + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; + + // Return a scratchpad buffer of size 1k + virtual void* scratchpad() const = 0; + + // Return a semaphore. The semaphore is initially initialized to 0, and + // each kernel using it is responsible for resetting to 0 upon completion + // to maintain the invariant that the semaphore is always equal to 0 upon + // each kernel start. + virtual unsigned int* semaphore() const = 0; +}; + +static cudaDeviceProp* m_deviceProperties; +static bool m_devicePropInitialized = false; + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. +#if __cplusplus >= 201103L + static std::atomic first(true); + if (first.exchange(false)) { +#else + static bool first = true; + if (first) { + first = false; +#endif + // We're the first thread to reach this point. + int num_devices; + cudaError_t status = cudaGetDeviceCount(&num_devices); + if (status != cudaSuccess) { + std::cerr << "Failed to get the number of CUDA devices: " + << cudaGetErrorString(status) + << std::endl; + assert(status == cudaSuccess); + } + m_deviceProperties = new cudaDeviceProp[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = cudaGetDeviceProperties(&m_deviceProperties[i], i); + if (status != cudaSuccess) { + std::cerr << "Failed to initialize CUDA device #" + << i + << ": " + << cudaGetErrorString(status) + << std::endl; + assert(status == cudaSuccess); + } + } + +#if __cplusplus >= 201103L + std::atomic_thread_fence(std::memory_order_release); +#endif + m_devicePropInitialized = true; + } else { + // Wait for the other thread to inititialize the properties. + while (!m_devicePropInitialized) { +#if __cplusplus >= 201103L + std::atomic_thread_fence(std::memory_order_acquire); +#endif + sleep(1); + } + } + } +} + +static const cudaStream_t default_stream = cudaStreamDefault; + +class CudaStreamDevice : public StreamInterface { + public: + // Use the default stream on the current device + CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { + cudaGetDevice(&device_); + initializeDeviceProp(); + } + // Use the default stream on the specified device + CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { + initializeDeviceProp(); + } + // Use the specified stream. Note that it's the + // caller responsibility to ensure that the stream can run on + // the specified device. If no device is specified the code + // assumes that the stream is associated to the current gpu device. + CudaStreamDevice(const cudaStream_t* stream, int device = -1) + : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { + if (device < 0) { + cudaGetDevice(&device_); + } else { + int num_devices; + cudaError_t err = cudaGetDeviceCount(&num_devices); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + assert(device < num_devices); + device_ = device; + } + initializeDeviceProp(); + } + + virtual ~CudaStreamDevice() { + if (scratch_) { + deallocate(scratch_); + } + } + + const cudaStream_t& stream() const { return *stream_; } + const cudaDeviceProp& deviceProperties() const { + return m_deviceProperties[device_]; + } + virtual void* allocate(size_t num_bytes) const { + cudaError_t err = cudaSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + void* result; + err = cudaMalloc(&result, num_bytes); + assert(err == cudaSuccess); + assert(result != NULL); + return result; + } + virtual void deallocate(void* buffer) const { + cudaError_t err = cudaSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + assert(buffer != NULL); + err = cudaFree(buffer); + assert(err == cudaSuccess); + } + + virtual void* scratchpad() const { + if (scratch_ == NULL) { + scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + virtual unsigned int* semaphore() const { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + kCudaScratchSize; + semaphore_ = reinterpret_cast(scratch); + cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + } + return semaphore_; + } + + private: + const cudaStream_t* stream_; + int device_; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +struct GpuDevice { + // The StreamInterface is not owned: the caller is + // responsible for its initialization and eventual destruction. + explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { + eigen_assert(stream); + } + explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { + eigen_assert(stream); + } + // TODO(bsteiner): This is an internal API, we should not expose it. + EIGEN_STRONG_INLINE const cudaStream_t& stream() const { + return stream_->stream(); + } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return stream_->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + stream_->deallocate(buffer); + } + + EIGEN_STRONG_INLINE void* scratchpad() const { + return stream_->scratchpad(); + } + + EIGEN_STRONG_INLINE unsigned int* semaphore() const { + return stream_->semaphore(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, + stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + cudaError_t err = + cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + cudaError_t err = + cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48*1024; + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { +#if defined(__CUDACC__) && !defined(__CUDA_ARCH__) + cudaError_t err = cudaStreamSynchronize(stream_->stream()); + if (err != cudaSuccess) { + std::cerr << "Error detected in CUDA stream: " + << cudaGetErrorString(err) + << std::endl; + assert(err == cudaSuccess); + } +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { + return stream_->deviceProperties().multiProcessorCount; + } + EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { + return stream_->deviceProperties().maxThreadsPerBlock; + } + EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { + return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { + return stream_->deviceProperties().sharedMemPerBlock; + } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return stream_->deviceProperties().major; + } + EIGEN_STRONG_INLINE int minorDeviceVersion() const { + return stream_->deviceProperties().minor; + } + + EIGEN_STRONG_INLINE int maxBlocks() const { + return max_blocks_; + } + + // This function checks if the CUDA runtime recorded an error for the + // underlying stream device. + inline bool ok() const { +#ifdef __CUDACC__ + cudaError_t error = cudaStreamQuery(stream_->stream()); + return (error == cudaSuccess) || (error == cudaErrorNotReady); +#else + return false; +#endif + } + + private: + const StreamInterface* stream_; + int max_blocks_; +}; + +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + assert(cudaGetLastError() == cudaSuccess); + + +// FIXME: Should be device and kernel specific. +#ifdef __CUDACC__ +static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { +#ifndef __CUDA_ARCH__ + cudaError_t status = cudaDeviceSetSharedMemConfig(config); + EIGEN_UNUSED_VARIABLE(status) + assert(status == cudaSuccess); +#else + EIGEN_UNUSED_VARIABLE(config) +#endif +} +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h new file mode 100644 index 00000000..427772bd --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -0,0 +1,81 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H + + +namespace Eigen { + +// Default device for the machine (typically a single cpu core) +struct DefaultDevice { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU + return 1; +#else + // Running on a CUDA device + return 32; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU + return l1CacheSize(); +#else + // Running on a CUDA device, return the amount of shared memory available. + return 48*1024; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + return l3CacheSize(); +#else + // Running on a CUDA device + return firstLevelCacheSize(); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + // Should return an enum that encodes the ISA supported by the CPU + return 1; +#else + // Running on a CUDA device + return __CUDA_ARCH__ / 100; +#endif + } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h new file mode 100644 index 00000000..56181836 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -0,0 +1,122 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Copyright (C) 2016 Benoit Steiner + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H + +namespace Eigen { +struct SyclDevice { + /// class members + /// sycl queue + mutable cl::sycl::queue m_queue; + /// std::map is the container used to make sure that we create only one buffer + /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. + /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. + mutable std::map> buffer_map; + /// creating device by using selector + template SyclDevice(dev_Selector s) + : +#ifdef EIGEN_EXCEPTIONS + m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) { + for (const auto& e : l) { + try { + std::rethrow_exception(e); + } catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + } + } + })) +#else + m_queue(cl::sycl::queue(s)) +#endif + {} + // destructor + ~SyclDevice() { deallocate_all(); } + + template void deallocate(T *p) const { + auto it = buffer_map.find(p); + if (it != buffer_map.end()) { + buffer_map.erase(it); + internal::aligned_free(p); + } + } + void deallocate_all() const { + std::map>::iterator it=buffer_map.begin(); + while (it!=buffer_map.end()) { + auto p=it->first; + buffer_map.erase(it); + internal::aligned_free(const_cast(p)); + it=buffer_map.begin(); + } + buffer_map.clear(); + } + + /// creation of sycl accessor for a buffer. This function first tries to find + /// the buffer in the buffer_map. If found it gets the accessor from it, if not, + ///the function then adds an entry by creating a sycl buffer for that particular pointer. + template inline cl::sycl::accessor + get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const { + return (get_sycl_buffer(num_bytes, ptr)->template get_access(cgh)); + } + + template inline std::pair>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const { + using Type = cl::sycl::buffer; + std::pair>::iterator,bool> ret = buffer_map.insert(std::pair>(ptr, std::shared_ptr(new Type(cl::sycl::range<1>(num_bytes)), + [](void *dataMem) { delete static_cast(dataMem); }))); + (static_cast(buffer_map.at(ptr).get()))->set_final_data(nullptr); + return ret; + } + + template inline cl::sycl::buffer* get_sycl_buffer(size_t num_bytes,const T * ptr) const { + return static_cast*>(add_sycl_buffer(ptr, num_bytes).first->second.get()); + } + + /// allocating memory on the cpu + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const { + return internal::aligned_malloc(8); + } + + // some runtime conditions that can be applied here + bool isDeviceSuitable() const { return true; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { + ::memcpy(dst, src, n); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { + auto host_acc= (static_cast*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access(); + memcpy(host_acc.get_pointer(), src, n); + } + /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const { + auto it = buffer_map.find(src); + if (it != buffer_map.end()) { + auto host_acc= (static_cast*>(it->second.get()))-> template get_access(); + memcpy(dst,host_acc.get_pointer(), n); + } else{ + eigen_assert("no device memory found. The memory might be destroyed before creation"); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return 1; + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h new file mode 100644 index 00000000..cfb27a08 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -0,0 +1,282 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H + +namespace Eigen { + +// Use the SimpleThreadPool by default. We'll switch to the new non blocking +// thread pool later. +#ifndef EIGEN_USE_SIMPLE_THREAD_POOL +template using ThreadPoolTempl = NonBlockingThreadPoolTempl; +typedef NonBlockingThreadPool ThreadPool; +#else +template using ThreadPoolTempl = SimpleThreadPoolTempl; +typedef SimpleThreadPool ThreadPool; +#endif + + +// Barrier is an object that allows one or more threads to wait until +// Notify has been called a specified number of times. +class Barrier { + public: + Barrier(unsigned int count) : state_(count << 1), notified_(false) { + eigen_assert(((count << 1) >> 1) == count); + } + ~Barrier() { + eigen_plain_assert((state_>>1) == 0); + } + + void Notify() { + unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2; + if (v != 1) { + eigen_assert(((v + 2) & ~1) != 0); + return; // either count has not dropped to 0, or waiter is not waiting + } + std::unique_lock l(mu_); + eigen_assert(!notified_); + notified_ = true; + cv_.notify_all(); + } + + void Wait() { + unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel); + if ((v >> 1) == 0) return; + std::unique_lock l(mu_); + while (!notified_) { + cv_.wait(l); + } + } + + private: + std::mutex mu_; + std::condition_variable cv_; + std::atomic state_; // low bit is waiter flag + bool notified_; +}; + + +// Notification is an object that allows a user to to wait for another +// thread to signal a notification that an event has occurred. +// +// Multiple threads can wait on the same Notification object, +// but only one caller must call Notify() on the object. +struct Notification : Barrier { + Notification() : Barrier(1) {}; +}; + + +// Runs an arbitrary function and then calls Notify() on the passed in +// Notification. +template struct FunctionWrapperWithNotification +{ + static void run(Notification* n, Function f, Args... args) { + f(args...); + if (n) { + n->Notify(); + } + } +}; + +template struct FunctionWrapperWithBarrier +{ + static void run(Barrier* b, Function f, Args... args) { + f(args...); + if (b) { + b->Notify(); + } + } +}; + +template +static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { + if (n) { + n->Wait(); + } +} + + +// Build a thread pool device on top the an existing pool of threads. +struct ThreadPoolDevice { + // The ownership of the thread pool remains with the caller. + ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + + EIGEN_STRONG_INLINE int numThreads() const { + return num_threads_; + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + return l1CacheSize(); + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // The l3 cache size is shared between all the cores. + return l3CacheSize() / num_threads_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { + // Should return an enum that encodes the ISA supported by the CPU + return 1; + } + + template + EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { + Notification* n = new Notification(); + pool_->Schedule(std::bind(&FunctionWrapperWithNotification::run, n, f, args...)); + return n; + } + + template + EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, + Function&& f, + Args&&... args) const { + pool_->Schedule(std::bind( + &FunctionWrapperWithBarrier::run, b, f, args...)); + } + + template + EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const { + pool_->Schedule(std::bind(f, args...)); + } + + // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if + // called from one of the threads in pool_. Returns -1 otherwise. + EIGEN_STRONG_INLINE int currentThreadId() const { + return pool_->CurrentThreadId(); + } + + // parallelFor executes f with [0, n) arguments in parallel and waits for + // completion. F accepts a half-open interval [first, last). + // Block size is choosen based on the iteration cost and resulting parallel + // efficiency. If block_align is not nullptr, it is called to round up the + // block size. + void parallelFor(Index n, const TensorOpCost& cost, + std::function block_align, + std::function f) const { + typedef TensorCostModel CostModel; + if (n <= 1 || numThreads() == 1 || + CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { + f(0, n); + return; + } + + // Calculate block size based on (1) the iteration cost and (2) parallel + // efficiency. We want blocks to be not too small to mitigate + // parallelization overheads; not too large to mitigate tail + // effect and potential load imbalance and we also want number + // of blocks to be evenly dividable across threads. + + double block_size_f = 1.0 / CostModel::taskSize(1, cost); + const Index max_oversharding_factor = 4; + Index block_size = numext::mini( + n, numext::maxi(divup(n, max_oversharding_factor * numThreads()), + block_size_f)); + const Index max_block_size = numext::mini(n, 2 * block_size); + if (block_align) { + Index new_block_size = block_align(block_size); + eigen_assert(new_block_size >= block_size); + block_size = numext::mini(n, new_block_size); + } + Index block_count = divup(n, block_size); + // Calculate parallel efficiency as fraction of total CPU time used for + // computations: + double max_efficiency = + static_cast(block_count) / + (divup(block_count, numThreads()) * numThreads()); + // Now try to increase block size up to max_block_size as long as it + // doesn't decrease parallel efficiency. + for (Index prev_block_count = block_count; + max_efficiency < 1.0 && prev_block_count > 1;) { + // This is the next block size that divides size into a smaller number + // of blocks than the current block_size. + Index coarser_block_size = divup(n, prev_block_count - 1); + if (block_align) { + Index new_block_size = block_align(coarser_block_size); + eigen_assert(new_block_size >= coarser_block_size); + coarser_block_size = numext::mini(n, new_block_size); + } + if (coarser_block_size > max_block_size) { + break; // Reached max block size. Stop. + } + // Recalculate parallel efficiency. + const Index coarser_block_count = divup(n, coarser_block_size); + eigen_assert(coarser_block_count < prev_block_count); + prev_block_count = coarser_block_count; + const double coarser_efficiency = + static_cast(coarser_block_count) / + (divup(coarser_block_count, numThreads()) * numThreads()); + if (coarser_efficiency + 0.01 >= max_efficiency) { + // Taking it. + block_size = coarser_block_size; + block_count = coarser_block_count; + if (max_efficiency < coarser_efficiency) { + max_efficiency = coarser_efficiency; + } + } + } + + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + Barrier barrier(static_cast(block_count)); + std::function handleRange; + handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) { + if (last - first <= block_size) { + // Single block or less, execute directly. + f(first, last); + barrier.Notify(); + return; + } + // Split into halves and submit to the pool. + Index mid = first + divup((last - first) / 2, block_size) * block_size; + pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); + pool_->Schedule([=, &handleRange]() { handleRange(first, mid); }); + }; + handleRange(0, n); + barrier.Wait(); + } + + // Convenience wrapper for parallelFor that does not align blocks. + void parallelFor(Index n, const TensorOpCost& cost, + std::function f) const { + parallelFor(n, cost, nullptr, std::move(f)); + } + + private: + ThreadPoolInterface* pool_; + int num_threads_; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h new file mode 100644 index 00000000..2a4e5b5c --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h @@ -0,0 +1,236 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H + +namespace Eigen { + +/** \internal + * + * \class TensorDimensionList + * \ingroup CXX11_Tensor_Module + * + * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n. + * + * \sa Tensor + */ + +template struct DimensionList { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + const Index operator[] (const Index i) const { return i; } +}; + +namespace internal { + +template struct array_size > { + static const size_t value = Rank; +}; +template struct array_size > { + static const size_t value = Rank; +}; + +template const Index array_get(DimensionList&) { + return n; +} +template const Index array_get(const DimensionList&) { + return n; +} + + +#if EIGEN_HAS_CONSTEXPR +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { + return true; + } +}; +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { + return true; + } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return true; + } +}; +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return true; + } +}; + +template +struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return true; + } +}; +template +struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return true; + } +}; + +template +struct index_statically_eq_impl > { + static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i == value; + } +}; +template +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i == value; + } +}; + +template +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i != value; + } +}; +template +struct index_statically_ne_impl > { + static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i != value; + } +}; + +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i > value; + } +}; +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i > value; + } +}; + +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i < value; + } +}; +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i < value; + } +}; + +#else +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { + return true; + } +}; +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { + return true; + } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() { + return true; + } +}; +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() { + return true; + } +}; + +template +struct indices_statically_known_to_increase_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { + return true; + } +}; +template +struct indices_statically_known_to_increase_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { + return true; + } +}; + +template +struct index_statically_eq_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; +template +struct index_statically_eq_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; + +template +struct index_statically_ne_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex){ + return false; + } +}; +template +struct index_statically_ne_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; + +template +struct index_statically_gt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; +template +struct index_statically_gt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; + +template +struct index_statically_lt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; +template +struct index_statically_lt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; +#endif + +} // end namespace internal +} // end namespace Eigen + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h new file mode 100644 index 00000000..7ffef43e --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -0,0 +1,428 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H + + +namespace Eigen { + +/** \internal + * + * \class TensorDimensions + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode and store the dimensions of a Tensor. + * + * The Sizes class encodes as part of the type the number of dimensions and the + * sizes corresponding to each dimension. It uses no storage space since it is + * entirely known at compile time. + * The DSizes class is its dynamic sibling: the number of dimensions is known + * at compile time but the sizes are set during execution. + * + * \sa Tensor + */ + +// Boilerplate code +namespace internal { + +template struct dget { + static const std::size_t value = get::value; +}; + + +template +struct fixed_size_tensor_index_linearization_helper +{ + template EIGEN_DEVICE_FUNC + static inline Index run(array const& indices, + const Dimensions& dimensions) + { + return array_get(indices) + + dget::value * + fixed_size_tensor_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct fixed_size_tensor_index_linearization_helper +{ + template EIGEN_DEVICE_FUNC + static inline Index run(array const&, const Dimensions&) + { + return 0; + } +}; + +template +struct fixed_size_tensor_index_extraction_helper +{ + template EIGEN_DEVICE_FUNC + static inline Index run(const Index index, + const Dimensions& dimensions) + { + const Index mult = (index == n-1) ? 1 : 0; + return array_get(dimensions) * mult + + fixed_size_tensor_index_extraction_helper::run(index, dimensions); + } +}; + +template +struct fixed_size_tensor_index_extraction_helper +{ + template EIGEN_DEVICE_FUNC + static inline Index run(const Index, + const Dimensions&) + { + return 0; + } + }; + +} // end namespace internal + + +// Fixed size +#ifndef EIGEN_EMULATE_CXX11_META_H +template +struct Sizes : internal::numeric_list { + typedef internal::numeric_list Base; + static const std::ptrdiff_t total_size = internal::arg_prod(Indices...); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { + return Base::count; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() { + return internal::arg_prod(Indices...); + } + + EIGEN_DEVICE_FUNC Sizes() { } + template + explicit EIGEN_DEVICE_FUNC Sizes(const array& /*indices*/) { + // todo: add assertion + } +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { } + explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list /*l*/) { + // todo: add assertion + } +#endif + + template Sizes& operator = (const T& /*other*/) { + // add assertion failure if the size of other is different + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const { + return internal::fixed_size_tensor_index_extraction_helper::run(index, *this); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes&) { + return Sizes::total_size; +} +} + +#else + +template +struct non_zero_size { + typedef internal::type2val type; +}; +template <> +struct non_zero_size<0> { + typedef internal::null_type type; +}; + +template struct Sizes { + typedef typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type Base; + static const size_t count = Base::count; + static const std::size_t total_size = internal::arg_prod::value; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return count; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() { + return internal::arg_prod::value; + } + + Sizes() { } + template + explicit Sizes(const array& /*indices*/) { + // todo: add assertion + } + template Sizes& operator = (const T& /*other*/) { + // add assertion failure if the size of other is different + return *this; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template Sizes(DenseIndex... /*indices*/) { } + explicit Sizes(std::initializer_list) { + // todo: add assertion + } +#else + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) { + } + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) { + } + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) { + } + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { + } + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index operator[] (const Index index) const { + switch (index) { + case 0: + return internal::get<0, Base>::value; + case 1: + return internal::get<1, Base>::value; + case 2: + return internal::get<2, Base>::value; + case 3: + return internal::get<3, Base>::value; + case 4: + return internal::get<4, Base>::value; + default: + eigen_assert(false && "index overflow"); + return static_cast(-1); + } + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); + } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes&) { + return Sizes::total_size; +} +} + +#endif + +// Boilerplate +namespace internal { +template +struct tensor_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, array const& dimensions) + { + return array_get(indices) + + array_get(dimensions) * + tensor_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct tensor_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, array const&) + { + return array_get(indices); + } +}; +} // end namespace internal + + + +// Dynamic size +template +struct DSizes : array { + typedef array Base; + static const int count = NumDims; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return NumDims; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const { + return (NumDims == 0) ? 1 : internal::array_prod(*static_cast(this)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = 0; + } + } + EIGEN_DEVICE_FUNC explicit DSizes(const array& a) : Base(a) { } + + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { + eigen_assert(NumDims == 1); + (*this)[0] = i0; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) { + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) { + eigen_assert(NumDims == 2); + (*this)[0] = i0; + (*this)[1] = i1; + } + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + eigen_assert(NumDims == 3); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + } + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + eigen_assert(NumDims == 4); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + } + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + eigen_assert(NumDims == 5); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + (*this)[4] = i4; + } +#endif + + EIGEN_DEVICE_FUNC DSizes& operator = (const array& other) { + *static_cast(this) = other; + return *this; + } + + // A constexpr would be so much better here + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); + } +}; + + + + +// Boilerplate +namespace internal { +template +struct tensor_vsize_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, std::vector const& dimensions) + { + return array_get(indices) + + array_get(dimensions) * + tensor_vsize_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct tensor_vsize_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, std::vector const&) + { + return array_get(indices); + } +}; +} // end namespace internal + + +namespace internal { + +template struct array_size > { + static const size_t value = NumDims; +}; +template struct array_size > { + static const size_t value = NumDims; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template struct array_size > { +static const std::ptrdiff_t value = Sizes::count; +}; +template struct array_size > { +static const std::ptrdiff_t value = Sizes::count; +}; +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { + return get >::value; +} +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) { + eigen_assert(false && "should never be called"); + return -1; +} +#else +template struct array_size > { + static const size_t value = Sizes::count; +}; +template struct array_size > { + static const size_t value = Sizes::count; +}; +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes&) { + return get::Base>::value; +} + +#endif + + +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { + return false; + } +}; +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) { + return (array_get(dims1) == array_get(dims2)) & + sizes_match_below_dim::run(dims1, dims2); + } +}; +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { + return true; + } +}; + +} // end namespace internal + + +template +EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) { + return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h new file mode 100644 index 00000000..5bd24881 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -0,0 +1,181 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template class MakePointer_> +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0 + }; + template + struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; +}; + +template class MakePointer_> +struct eval, Eigen::Dense> +{ + typedef const TensorEvalToOp& type; +}; + +template class MakePointer_> +struct nested, 1, typename eval >::type> +{ + typedef TensorEvalToOp type; +}; + +} // end namespace internal + + + + +template class MakePointer_> +class TensorEvalToOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename MakePointer_::Type PointerType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr) + : m_xpr(expr), m_buffer(buffer) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; } + + protected: + typename XprType::Nested m_xpr; + PointerType m_buffer; +}; + + + +template class MakePointer_> +struct TensorEvaluator, Device> +{ + typedef TensorEvalToOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = true + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), + m_buffer(op.buffer()), m_op(op), m_expression(op.expression()) + { } + + // Used for accessor extraction in SYCL Managed TensorMap: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const { + return m_op; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + } + + typedef typename internal::traits >::template MakePointer::Type DevicePointer; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) { + EIGEN_UNUSED_VARIABLE(scalar); + eigen_assert(scalar == NULL); + return m_impl.evalSubExprsIfNeeded(m_buffer); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_buffer[i] = m_impl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? Aligned : Unaligned>(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt(m_buffer + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here. + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; } + ArgType expression() const { return m_expression; } + + /// required by sycl in order to extract the accessor + const TensorEvaluator& impl() const { return m_impl; } + /// added for sycl in order to construct the buffer from the sycl device + const Device& device() const{return m_device;} + + private: + TensorEvaluator m_impl; + const Device& m_device; + DevicePointer m_buffer; + const XprType& m_op; + const ArgType m_expression; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h new file mode 100644 index 00000000..d087ca4c --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -0,0 +1,633 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor evaluator classes. + * + * These classes are responsible for the evaluation of the tensor expression. + * + * TODO: add support for more types of expressions, in particular expressions + * leading to lvalues (slicing, reshaping, etc...) + */ + +// Generic evaluator +template +struct TensorEvaluator +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits::NumDimensions > 0 ? + internal::traits::NumDimensions : 0; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = (internal::unpacket_traits::size > 1), + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(const_cast::template MakePointer::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) + { } + + // Used for accessor extraction in SYCL Managed TensorMap: + const Derived& derived() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { + if (dest) { + m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); + return false; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); + return m_data[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_assert(m_data); + return m_data[index]; + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt(m_data + index); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + return internal::pstoret(m_data + index, x); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { + eigen_assert(m_data); + if (static_cast(Layout) == static_cast(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { + eigen_assert(m_data); + if (static_cast(Layout) == static_cast(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits::size); + } + + EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } + + /// required by sycl in order to construct sycl buffer from raw pointer + const Device& device() const{return m_device;} + + protected: + typename internal::traits::template MakePointer::Type m_data; + Dimensions m_dims; + const Device& m_device; + const Derived& m_impl; +}; + +namespace { +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T loadConstant(const T* address) { + return *address; +} +// Use the texture cache on CUDA devices whenever possible +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float loadConstant(const float* address) { + return __ldg(address); +} +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double loadConstant(const double* address) { + return __ldg(address); +} +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +Eigen::half loadConstant(const Eigen::half* address) { + return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); +} +#endif +} + + +// Default evaluator for rvalues +template +struct TensorEvaluator +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits::NumDimensions > 0 ? + internal::traits::NumDimensions : 0; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = (internal::unpacket_traits::size > 1), + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true + }; + + // Used for accessor extraction in SYCL Managed TensorMap: + const Derived& derived() const { return m_impl; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + if (!NumTraits::type>::RequireInitialization && data) { + m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar)); + return false; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); + return loadConstant(m_data+index); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt_ro(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { + eigen_assert(m_data); + const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? m_dims.IndexOfColMajor(coords) + : m_dims.IndexOfRowMajor(coords); + return loadConstant(m_data+index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits::size); + } + + EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } + + /// added for sycl in order to construct the buffer from the sycl device + const Device& device() const{return m_device;} + + protected: + typename internal::traits::template MakePointer::Type m_data; + Dimensions m_dims; + const Device& m_device; + const Derived& m_impl; +}; + + + + +// -------------------- CwiseNullaryOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorCwiseNullaryOp XprType; + + enum { + IsAligned = true, + PacketAccess = internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC + TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_wrapper(m_functor, index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_wrapper.template packetOp(m_functor, index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits::size); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + /// required by sycl in order to extract the accessor + const TensorEvaluator& impl() const { return m_argImpl; } + /// required by sycl in order to extract the accessor + NullaryOp functor() const { return m_functor; } + + + private: + const NullaryOp m_functor; + TensorEvaluator m_argImpl; + const internal::nullary_wrapper m_wrapper; +}; + + + +// -------------------- CwiseUnaryOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorCwiseUnaryOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_argImpl(op.nestedExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_argImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_argImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_argImpl.coeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_argImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_argImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + /// required by sycl in order to extract the accessor + const TensorEvaluator & impl() const { return m_argImpl; } + /// added for sycl in order to construct the buffer from sycl device + UnaryOp functor() const { return m_functor; } + + + private: + const UnaryOp m_functor; + TensorEvaluator m_argImpl; +}; + + +// -------------------- CwiseBinaryOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorCwiseBinaryOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use right impl instead if right impl dimensions are known at compile time. + return m_leftImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_leftImpl.costPerCoeff(vectorized) + + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& left_impl() const { return m_leftImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& right_impl() const { return m_rightImpl; } + /// required by sycl in order to extract the accessor + BinaryOp functor() const { return m_functor; } + + private: + const BinaryOp m_functor; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + +// -------------------- CwiseTernaryOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorCwiseTernaryOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_arg1Impl(op.arg1Expression(), device), + m_arg2Impl(op.arg2Expression(), device), + m_arg3Impl(op.arg3Expression(), device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::Index, + typename internal::traits::Index>::value), + STORAGE_INDEX_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::Index, + typename internal::traits::Index>::value), + STORAGE_INDEX_MUST_MATCH) + + eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use arg2 or arg3 dimensions if they are known at compile time. + return m_arg1Impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_arg1Impl.evalSubExprsIfNeeded(NULL); + m_arg2Impl.evalSubExprsIfNeeded(NULL); + m_arg3Impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_arg1Impl.cleanup(); + m_arg2Impl.cleanup(); + m_arg3Impl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_arg1Impl.template packet(index), + m_arg2Impl.template packet(index), + m_arg3Impl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_arg1Impl.costPerCoeff(vectorized) + + m_arg2Impl.costPerCoeff(vectorized) + + m_arg3Impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + /// required by sycl in order to extract the accessor + const TensorEvaluator & arg1Impl() const { return m_arg1Impl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& arg2Impl() const { return m_arg2Impl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& arg3Impl() const { return m_arg3Impl; } + + private: + const TernaryOp m_functor; + TensorEvaluator m_arg1Impl; + TensorEvaluator m_arg2Impl; + TensorEvaluator m_arg3Impl; +}; + + +// -------------------- SelectOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorSelectOp XprType; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + internal::packet_traits::HasBlend, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_condImpl(op.ifExpression(), device), + m_thenImpl(op.thenExpression(), device), + m_elseImpl(op.elseExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); + eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use then or else impl instead if they happen to be known at compile time. + return m_condImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_condImpl.evalSubExprsIfNeeded(NULL); + m_thenImpl.evalSubExprsIfNeeded(NULL); + m_elseImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_condImpl.cleanup(); + m_thenImpl.cleanup(); + m_elseImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index); + } + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + internal::Selector select; + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index+i); + } + return internal::pblend(select, + m_thenImpl.template packet(index), + m_elseImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return m_condImpl.costPerCoeff(vectorized) + + m_thenImpl.costPerCoeff(vectorized) + .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator & cond_impl() const { return m_condImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& then_impl() const { return m_thenImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& else_impl() const { return m_elseImpl; } + + private: + TensorEvaluator m_condImpl; + TensorEvaluator m_thenImpl; + TensorEvaluator m_elseImpl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h new file mode 100644 index 00000000..77f9c7c5 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -0,0 +1,288 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H + +namespace Eigen { + +/** \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + */ +namespace internal { + +// Default strategy: the expression is evaluated with a single cpu thread. +template +class TensorExecutor +{ + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, const Device& device = Device()) + { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + for (Index i = 0; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + + +template +class TensorExecutor +{ + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) + { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + const int PacketSize = unpacket_traits::PacketReturnType>::size; + // Give the compiler a strong hint to unroll the loop. But don't insist + // on unrolling, because if the function is expensive the compiler should not + // unroll the loop at the expense of inlining. + const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; + for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { + for (Index j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } + } + const Index VectorizedSize = (size / PacketSize) * PacketSize; + for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (Index i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + + + +// Multicore strategy: the index space is partitioned and each partition is executed on a single core +#ifdef EIGEN_USE_THREADS +template +struct EvalRange { + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; + eigen_assert(last >= first); + for (Index i = first; i < last; ++i) { + evaluator.evalScalar(i); + } + } + + static Index alignBlockSize(Index size) { + return size; + } +}; + +template +struct EvalRange { + static const int PacketSize = unpacket_traits::size; + + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; + eigen_assert(last >= first); + Index i = first; + if (last - first >= PacketSize) { + eigen_assert(first % PacketSize == 0); + Index last_chunk_offset = last - 4 * PacketSize; + // Give the compiler a strong hint to unroll the loop. But don't insist + // on unrolling, because if the function is expensive the compiler should not + // unroll the loop at the expense of inlining. + for (; i <= last_chunk_offset; i += 4*PacketSize) { + for (Index j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } + } + last_chunk_offset = last - PacketSize; + for (; i <= last_chunk_offset; i += PacketSize) { + evaluator.evalPacket(i); + } + } + for (; i < last; ++i) { + evaluator.evalScalar(i); + } + } + + static Index alignBlockSize(Index size) { + // Align block size to packet size and account for unrolling in run above. + if (size >= 16 * PacketSize) { + return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); + } + // Aligning to 4 * PacketSize would increase block size by more than 25%. + return (size + PacketSize - 1) & ~(PacketSize - 1); + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const ThreadPoolDevice& device) + { + typedef TensorEvaluator Evaluator; + Evaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); +#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) + device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [&evaluator](Index first, Index last) { + EvalRange::run(&evaluator, first, last); + }); +#else + size_t num_threads = device.numThreads(); + if (num_threads > 1) { + num_threads = TensorCostModel::numThreads( + size, evaluator.costPerCoeff(Vectorizable), num_threads); + } + if (num_threads == 1) { + EvalRange::run(&evaluator, 0, size); + } else { + const Index PacketSize = Vectorizable ? unpacket_traits::size : 1; + Index blocksz = std::ceil(static_cast(size)/num_threads) + PacketSize - 1; + const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + Barrier barrier(numblocks); + for (int i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier( + &barrier, &EvalRange::run, + &evaluator, i * blocksize, (i + 1) * blocksize); + } + if (numblocks * blocksize < size) { + EvalRange::run( + &evaluator, numblocks * blocksize, size); + } + barrier.Wait(); + } +#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL) + } + evaluator.cleanup(); + } +}; +#endif // EIGEN_USE_THREADS + + +// GPU: the evaluation of the expression is offloaded to a GPU. +#if defined(EIGEN_USE_GPU) + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + static void run(const Expression& expr, const GpuDevice& device); +}; + + +#if defined(__CUDACC__) +template +struct EigenMetaKernelEval { + static __device__ EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, Index first, Index last, Index step_size) { + for (Index i = first; i < last; i += step_size) { + eval.evalScalar(i); + } + } +}; + +template +struct EigenMetaKernelEval { + static __device__ EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, Index first, Index last, Index step_size) { + const Index PacketSize = unpacket_traits::size; + const Index vectorized_size = (last / PacketSize) * PacketSize; + const Index vectorized_step_size = step_size * PacketSize; + + // Use the vector path + for (Index i = first * PacketSize; i < vectorized_size; + i += vectorized_step_size) { + eval.evalPacket(i); + } + for (Index i = vectorized_size + first; i < last; i += step_size) { + eval.evalScalar(i); + } + } +}; + +template +__global__ void +__launch_bounds__(1024) +EigenMetaKernel(Evaluator eval, Index size) { + + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; + + const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; + EigenMetaKernelEval::run(eval, first_index, size, step_size); +} + +/*static*/ +template +inline void TensorExecutor::run( + const Expression& expr, const GpuDevice& device) { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const int block_size = device.maxCudaThreadsPerBlock(); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; + const Index size = array_prod(evaluator.dimensions()); + // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. + const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); + + LAUNCH_CUDA_KERNEL( + (EigenMetaKernel, Index>), + num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); +} + +#endif // __CUDACC__ +#endif // EIGEN_USE_GPU + +// SYCL Executor policy +#ifdef EIGEN_USE_SYCL + +template +class TensorExecutor { +public: + static inline void run(const Expression &expr, const SyclDevice &device) { + // call TensorSYCL module + TensorSycl::run(expr, device); + } +}; + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h new file mode 100644 index 00000000..5267b70d --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -0,0 +1,371 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H + +namespace Eigen { + +/** \class TensorExpr + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor expression classes. + * + * The TensorCwiseNullaryOp class applies a nullary operators to an expression. + * This is typically used to generate constants. + * + * The TensorCwiseUnaryOp class represents an expression where a unary operator + * (e.g. cwiseSqrt) is applied to an expression. + * + * The TensorCwiseBinaryOp class represents an expression where a binary + * operator (e.g. addition) is applied to a lhs and a rhs expression. + * + */ +namespace internal { +template +struct traits > + : traits +{ + typedef traits XprTraits; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0 + }; +}; + +} // end namespace internal + + + +template +class TensorCwiseNullaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef TensorCwiseNullaryOp Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + const NullaryOp& functor() const { return m_functor; } + + protected: + typename XprType::Nested m_xpr; + const NullaryOp m_functor; +}; + + + +namespace internal { +template +struct traits > + : traits +{ + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename result_of::type Scalar; + typedef traits XprTraits; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCwiseUnaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorCwiseUnaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCwiseUnaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const UnaryOp& functor() const { return m_functor; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const UnaryOp m_functor; +}; + + +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs + // are different. + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. + typedef typename result_of< + BinaryOp(typename LhsXprType::Scalar, + typename RhsXprType::Scalar)>::type Scalar; + typedef traits XprTraits; + typedef typename promote_storage_type< + typename traits::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type< + typename traits::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCwiseBinaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorCwiseBinaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCwiseBinaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const BinaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const BinaryOp m_functor; +}; + + +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the args are different. + typedef typename result_of< + TernaryOp(typename Arg1XprType::Scalar, + typename Arg2XprType::Scalar, + typename Arg3XprType::Scalar)>::type Scalar; + typedef traits XprTraits; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename Arg1XprType::Nested Arg1Nested; + typedef typename Arg2XprType::Nested Arg2Nested; + typedef typename Arg3XprType::Nested Arg3Nested; + typedef typename remove_reference::type _Arg1Nested; + typedef typename remove_reference::type _Arg2Nested; + typedef typename remove_reference::type _Arg3Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCwiseTernaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorCwiseTernaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCwiseTernaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp()) + : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const TernaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + arg1Expression() const { return m_arg1_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + arg2Expression() const { return m_arg2_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + arg3Expression() const { return m_arg3_xpr; } + + protected: + typename Arg1XprType::Nested m_arg1_xpr; + typename Arg2XprType::Nested m_arg2_xpr; + typename Arg3XprType::Nested m_arg3_xpr; + const TernaryOp m_functor; +}; + + +namespace internal { +template +struct traits > + : traits +{ + typedef typename traits::Scalar Scalar; + typedef traits XprTraits; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename IfXprType::Nested IfNested; + typedef typename ThenXprType::Nested ThenNested; + typedef typename ElseXprType::Nested ElseNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorSelectOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorSelectOp type; +}; + +} // end namespace internal + + +template +class TensorSelectOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC + TensorSelectOp(const IfXprType& a_condition, + const ThenXprType& a_then, + const ElseXprType& a_else) + : m_condition(a_condition), m_then(a_then), m_else(a_else) + { } + + EIGEN_DEVICE_FUNC + const IfXprType& ifExpression() const { return m_condition; } + + EIGEN_DEVICE_FUNC + const ThenXprType& thenExpression() const { return m_then; } + + EIGEN_DEVICE_FUNC + const ElseXprType& elseExpression() const { return m_else; } + + protected: + typename IfXprType::Nested m_condition; + typename ThenXprType::Nested m_then; + typename ElseXprType::Nested m_else; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h new file mode 100644 index 00000000..473e4940 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -0,0 +1,651 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Jianwei Cui +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H +#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H + +// This code requires the ability to initialize arrays of constant +// values directly inside a class. +#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900 + +namespace Eigen { + +/** \class TensorFFT + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor FFT class. + * + * TODO: + * Vectorize the Cooley Tukey and the Bluestein algorithm + * Add support for multithreaded evaluation + * Improve the performance on GPU + */ + +template struct MakeComplex { + template + EIGEN_DEVICE_FUNC + T operator() (const T& val) const { return val; } +}; + +template <> struct MakeComplex { + template + EIGEN_DEVICE_FUNC + std::complex operator() (const T& val) const { return std::complex(val, 0); } +}; + +template <> struct MakeComplex { + template + EIGEN_DEVICE_FUNC + std::complex operator() (const std::complex& val) const { return val; } +}; + +template struct PartOf { + template T operator() (const T& val) const { return val; } +}; + +template <> struct PartOf { + template T operator() (const std::complex& val) const { return val.real(); } +}; + +template <> struct PartOf { + template T operator() (const std::complex& val) const { return val.imag(); } +}; + +namespace internal { +template +struct traits > : public traits { + typedef traits XprTraits; + typedef typename NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename XprTraits::Scalar InputScalar; + typedef typename conditional::type OutputScalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorFFTOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorFFTOp type; +}; + +} // end namespace internal + +template +class TensorFFTOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename internal::conditional::type OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft) + : m_xpr(expr), m_fft(fft) {} + + EIGEN_DEVICE_FUNC + const FFT& fft() const { return m_fft; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& expression() const { + return m_xpr; + } + + protected: + typename XprType::Nested m_xpr; + const FFT m_fft; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorFFTOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef internal::traits XprTraits; + typedef typename XprTraits::Scalar InputScalar; + typedef typename internal::conditional::type OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, + PacketAccess = true, + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + } + } + m_size = m_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (data) { + evalToBuf(data); + return false; + } else { + m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size); + evalToBuf(m_data); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_data) { + m_device.deallocate(m_data); + m_data = NULL; + } + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { + return m_data[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType + packet(Index index) const { + return internal::ploadt(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } + + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) { + const bool write_to_out = internal::is_same::value; + ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); + + for (Index i = 0; i < m_size; ++i) { + buf[i] = MakeComplex::value>()(m_impl.coeff(i)); + } + + for (size_t i = 0; i < m_fft.size(); ++i) { + Index dim = m_fft[i]; + eigen_assert(dim >= 0 && dim < NumDims); + Index line_len = m_dimensions[dim]; + eigen_assert(line_len >= 1); + ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len); + const bool is_power_of_two = isPowerOfTwo(line_len); + const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len); + const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite); + + ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* pos_j_base_powered = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1)); + if (!is_power_of_two) { + // Compute twiddle factors + // t_n = exp(sqrt(-1) * pi * n^2 / line_len) + // for n = 0, 1,..., line_len-1. + // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 + pos_j_base_powered[0] = ComplexScalar(1, 0); + if (line_len > 1) { + const RealScalar pi_over_len(EIGEN_PI / line_len); + const ComplexScalar pos_j_base = ComplexScalar( + std::cos(pi_over_len), std::sin(pi_over_len)); + pos_j_base_powered[1] = pos_j_base; + if (line_len > 2) { + const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; + for (int j = 2; j < line_len + 1; ++j) { + pos_j_base_powered[j] = pos_j_base_powered[j - 1] * + pos_j_base_powered[j - 1] / + pos_j_base_powered[j - 2] * pos_j_base_sq; + } + } + } + } + + for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) { + const Index base_offset = getBaseOffsetFromIndex(partial_index, dim); + + // get data into line_buf + const Index stride = m_strides[dim]; + if (stride == 1) { + memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); + } else { + Index offset = base_offset; + for (int j = 0; j < line_len; ++j, offset += stride) { + line_buf[j] = buf[offset]; + } + } + + // processs the line + if (is_power_of_two) { + processDataLineCooleyTukey(line_buf, line_len, log_len); + } + else { + processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered); + } + + // write back + if (FFTDir == FFT_FORWARD && stride == 1) { + memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); + } else { + Index offset = base_offset; + const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); + for (int j = 0; j < line_len; ++j, offset += stride) { + buf[offset] = (FFTDir == FFT_FORWARD) ? line_buf[j] : line_buf[j] * div_factor; + } + } + } + m_device.deallocate(line_buf); + if (!is_power_of_two) { + m_device.deallocate(a); + m_device.deallocate(b); + m_device.deallocate(pos_j_base_powered); + } + } + + if(!write_to_out) { + for (Index i = 0; i < m_size; ++i) { + data[i] = PartOf()(buf[i]); + } + m_device.deallocate(buf); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) { + eigen_assert(x > 0); + return !(x & (x - 1)); + } + + // The composite number for padding, used in Bluestein's FFT algorithm + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) { + Index i = 2; + while (i < 2 * n - 1) i *= 2; + return i; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) { + Index log2m = 0; + while (m >>= 1) log2m++; + return log2m; + } + + // Call Cooley Tukey algorithm directly, data length must be power of 2 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, Index log_len) { + eigen_assert(isPowerOfTwo(line_len)); + scramble_FFT(line_buf, line_len); + compute_1D_Butterfly(line_buf, line_len, log_len); + } + + // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, Index good_composite, Index log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) { + Index n = line_len; + Index m = good_composite; + ComplexScalar* data = line_buf; + + for (Index i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + a[i] = data[i] * numext::conj(pos_j_base_powered[i]); + } + else { + a[i] = data[i] * pos_j_base_powered[i]; + } + } + for (Index i = n; i < m; ++i) { + a[i] = ComplexScalar(0, 0); + } + + for (Index i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[i]; + } + else { + b[i] = numext::conj(pos_j_base_powered[i]); + } + } + for (Index i = n; i < m - n; ++i) { + b[i] = ComplexScalar(0, 0); + } + for (Index i = m - n; i < m; ++i) { + if(FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[m-i]; + } + else { + b[i] = numext::conj(pos_j_base_powered[m-i]); + } + } + + scramble_FFT(a, m); + compute_1D_Butterfly(a, m, log_len); + + scramble_FFT(b, m); + compute_1D_Butterfly(b, m, log_len); + + for (Index i = 0; i < m; ++i) { + a[i] *= b[i]; + } + + scramble_FFT(a, m); + compute_1D_Butterfly(a, m, log_len); + + //Do the scaling after ifft + for (Index i = 0; i < m; ++i) { + a[i] /= m; + } + + for (Index i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + data[i] = a[i] * numext::conj(pos_j_base_powered[i]); + } + else { + data[i] = a[i] * pos_j_base_powered[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) { + eigen_assert(isPowerOfTwo(n)); + Index j = 1; + for (Index i = 1; i < n; ++i){ + if (j > i) { + std::swap(data[j-1], data[i-1]); + } + Index m = n >> 1; + while (m >= 2 && j > m) { + j -= m; + m >>= 1; + } + j += m; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) { + ComplexScalar tmp = data[1]; + data[1] = data[0] - data[1]; + data[0] += tmp; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) { + ComplexScalar tmp[4]; + tmp[0] = data[0] + data[1]; + tmp[1] = data[0] - data[1]; + tmp[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]); + } else { + tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]); + } + data[0] = tmp[0] + tmp[2]; + data[1] = tmp[1] + tmp[3]; + data[2] = tmp[0] - tmp[2]; + data[3] = tmp[1] - tmp[3]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) { + ComplexScalar tmp_1[8]; + ComplexScalar tmp_2[8]; + + tmp_1[0] = data[0] + data[1]; + tmp_1[1] = data[0] - data[1]; + tmp_1[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1); + } else { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1); + } + tmp_1[4] = data[4] + data[5]; + tmp_1[5] = data[4] - data[5]; + tmp_1[6] = data[6] + data[7]; + if (Dir == FFT_FORWARD) { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1); + } else { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1); + } + tmp_2[0] = tmp_1[0] + tmp_1[2]; + tmp_2[1] = tmp_1[1] + tmp_1[3]; + tmp_2[2] = tmp_1[0] - tmp_1[2]; + tmp_2[3] = tmp_1[1] - tmp_1[3]; + tmp_2[4] = tmp_1[4] + tmp_1[6]; +// SQRT2DIV2 = sqrt(2)/2 +#define SQRT2DIV2 0.7071067811865476 + if (Dir == FFT_FORWARD) { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2); + } else { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2); + } + data[0] = tmp_2[0] + tmp_2[4]; + data[1] = tmp_2[1] + tmp_2[5]; + data[2] = tmp_2[2] + tmp_2[6]; + data[3] = tmp_2[3] + tmp_2[7]; + data[4] = tmp_2[0] - tmp_2[4]; + data[5] = tmp_2[1] - tmp_2[5]; + data[6] = tmp_2[2] - tmp_2[6]; + data[7] = tmp_2[3] - tmp_2[7]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge( + ComplexScalar* data, Index n, Index n_power_of_2) { + // Original code: + // RealScalar wtemp = std::sin(M_PI/n); + // RealScalar wpi = -std::sin(2 * M_PI/n); + const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; + const RealScalar wpi = (Dir == FFT_FORWARD) + ? m_minus_sin_2_PI_div_n_LUT[n_power_of_2] + : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; + + const ComplexScalar wp(wtemp, wpi); + const ComplexScalar wp_one = wp + ComplexScalar(1, 0); + const ComplexScalar wp_one_2 = wp_one * wp_one; + const ComplexScalar wp_one_3 = wp_one_2 * wp_one; + const ComplexScalar wp_one_4 = wp_one_3 * wp_one; + const Index n2 = n / 2; + ComplexScalar w(1.0, 0.0); + for (Index i = 0; i < n2; i += 4) { + ComplexScalar temp0(data[i + n2] * w); + ComplexScalar temp1(data[i + 1 + n2] * w * wp_one); + ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2); + ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3); + w = w * wp_one_4; + + data[i + n2] = data[i] - temp0; + data[i] += temp0; + + data[i + 1 + n2] = data[i + 1] - temp1; + data[i + 1] += temp1; + + data[i + 2 + n2] = data[i + 2] - temp2; + data[i + 2] += temp2; + + data[i + 3 + n2] = data[i + 3] - temp3; + data[i + 3] += temp3; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly( + ComplexScalar* data, Index n, Index n_power_of_2) { + eigen_assert(isPowerOfTwo(n)); + if (n > 8) { + compute_1D_Butterfly(data, n / 2, n_power_of_2 - 1); + compute_1D_Butterfly(data + n / 2, n / 2, n_power_of_2 - 1); + butterfly_1D_merge(data, n, n_power_of_2); + } else if (n == 8) { + butterfly_8(data); + } else if (n == 4) { + butterfly_4(data); + } else if (n == 2) { + butterfly_2(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const { + Index result = 0; + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > omitted_dim; --i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + else { + for (Index i = 0; i < omitted_dim; ++i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + // Value of index_coords[omitted_dim] is not determined to this step + return result; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const { + Index result = base + offset * m_strides[omitted_dim] ; + return result; + } + + protected: + Index m_size; + const FFT& m_fft; + Dimensions m_dimensions; + array m_strides; + TensorEvaluator m_impl; + CoeffReturnType* m_data; + const Device& m_device; + + // This will support a maximum FFT size of 2^32 for each dimension + // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; + const RealScalar m_sin_PI_div_n_LUT[32] = { + RealScalar(0.0), + RealScalar(-2), + RealScalar(-0.999999999999999), + RealScalar(-0.292893218813453), + RealScalar(-0.0761204674887130), + RealScalar(-0.0192147195967696), + RealScalar(-0.00481527332780311), + RealScalar(-0.00120454379482761), + RealScalar(-3.01181303795779e-04), + RealScalar(-7.52981608554592e-05), + RealScalar(-1.88247173988574e-05), + RealScalar(-4.70619042382852e-06), + RealScalar(-1.17654829809007e-06), + RealScalar(-2.94137117780840e-07), + RealScalar(-7.35342821488550e-08), + RealScalar(-1.83835707061916e-08), + RealScalar(-4.59589268710903e-09), + RealScalar(-1.14897317243732e-09), + RealScalar(-2.87243293150586e-10), + RealScalar( -7.18108232902250e-11), + RealScalar(-1.79527058227174e-11), + RealScalar(-4.48817645568941e-12), + RealScalar(-1.12204411392298e-12), + RealScalar(-2.80511028480785e-13), + RealScalar(-7.01277571201985e-14), + RealScalar(-1.75319392800498e-14), + RealScalar(-4.38298482001247e-15), + RealScalar(-1.09574620500312e-15), + RealScalar(-2.73936551250781e-16), + RealScalar(-6.84841378126949e-17), + RealScalar(-1.71210344531737e-17), + RealScalar(-4.28025861329343e-18) + }; + + // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i)); + const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = { + RealScalar(0.0), + RealScalar(0.0), + RealScalar(-1.00000000000000e+00), + RealScalar(-7.07106781186547e-01), + RealScalar(-3.82683432365090e-01), + RealScalar(-1.95090322016128e-01), + RealScalar(-9.80171403295606e-02), + RealScalar(-4.90676743274180e-02), + RealScalar(-2.45412285229123e-02), + RealScalar(-1.22715382857199e-02), + RealScalar(-6.13588464915448e-03), + RealScalar(-3.06795676296598e-03), + RealScalar(-1.53398018628477e-03), + RealScalar(-7.66990318742704e-04), + RealScalar(-3.83495187571396e-04), + RealScalar(-1.91747597310703e-04), + RealScalar(-9.58737990959773e-05), + RealScalar(-4.79368996030669e-05), + RealScalar(-2.39684498084182e-05), + RealScalar(-1.19842249050697e-05), + RealScalar(-5.99211245264243e-06), + RealScalar(-2.99605622633466e-06), + RealScalar(-1.49802811316901e-06), + RealScalar(-7.49014056584716e-07), + RealScalar(-3.74507028292384e-07), + RealScalar(-1.87253514146195e-07), + RealScalar(-9.36267570730981e-08), + RealScalar(-4.68133785365491e-08), + RealScalar(-2.34066892682746e-08), + RealScalar(-1.17033446341373e-08), + RealScalar(-5.85167231706864e-09), + RealScalar(-2.92583615853432e-09) + }; +}; + +} // end namespace Eigen + +#endif // EIGEN_HAS_CONSTEXPR + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h new file mode 100644 index 00000000..5a41a867 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -0,0 +1,389 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H +#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H + +namespace Eigen { + +/** \class TensorFixedSize + * \ingroup CXX11_Tensor_Module + * + * \brief The fixed sized version of the tensor class. + * + * The fixed sized equivalent of + * Eigen::Tensor t(3, 5, 7); + * is + * Eigen::TensorFixedSize> t; + */ + +template +class TensorFixedSize : public TensorBase > +{ + public: + typedef TensorFixedSize Self; + typedef TensorBase > Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef Scalar_ Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + static const int Options = Options_; + + enum { + IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0), + Layout = Options_ & RowMajor ? RowMajor : ColMajor, + CoordAccess = true, + RawAccess = true + }; + + typedef Dimensions_ Dimensions; + static const std::size_t NumIndices = Dimensions::count; + + protected: + TensorStorage m_storage; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array{{firstIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + if (Options&RowMajor) { + const Index index = i1 + i0 * m_storage.dimensions()[1]; + return m_storage.data()[index]; + } else { + const Index index = i0 + i1 * m_storage.dimensions()[0]; + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + if (Options&RowMajor) { + const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2); + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + if (Options&RowMajor) { + const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3)); + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + if (Options&RowMajor) { + const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0))); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4))); + return m_storage.data()[index]; + } + } +#endif + + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array{{firstIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + if (Options&RowMajor) { + const Index index = i1 + i0 * m_storage.dimensions()[1]; + return m_storage.data()[index]; + } else { + const Index index = i0 + i1 * m_storage.dimensions()[0]; + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + if (Options&RowMajor) { + const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2); + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + if (Options&RowMajor) { + const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3)); + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + if (Options&RowMajor) { + const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0))); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4))); + return m_storage.data()[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize() + : m_storage() + { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) + : m_storage(other.m_storage) + { + } + +#if EIGEN_HAS_RVALUE_REFERENCES + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other) + : m_storage(other.m_storage) + { + } +#endif + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other) + { + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other) + { + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE bool checkIndexRange(const array& /*indices*/) const + { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return true; + // check whether the indices are all >= 0 + /* array_apply_and_reduce(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce(indices, m_storage.dimensions());*/ + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h new file mode 100644 index 00000000..53b915da --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -0,0 +1,169 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H + +namespace Eigen { + +namespace internal { +template class MakePointer_> +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0 + }; + template struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; +}; + +template class MakePointer_> +struct eval, Eigen::Dense> +{ + typedef const TensorForcedEvalOp& type; +}; + +template class MakePointer_> +struct nested, 1, typename eval >::type> +{ + typedef TensorForcedEvalOp type; +}; + +} // end namespace internal + + + +// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) + +/** \class TensorForcedEvalOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +/// `template class MakePointer_` is added to convert the host pointer to the device pointer. +/// It is added due to the fact that for our device compiler `T*` is not allowed. +/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. +/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_` is `T*` . +/// Therefore, by adding the default value, we managed to convert the type and it does not break any +/// existing code as its default value is `T*`. +template class MakePointer_> +class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + + +template class MakePointer_> +struct TensorEvaluator, Device> +{ + typedef TensorForcedEvalOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = true, + PacketAccess = (PacketSize > 1), + Layout = TensorEvaluator::Layout, + RawAccess = true + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + /// op_ is used for sycl + : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) + { } + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + const Index numValues = internal::array_prod(m_impl.dimensions()); + m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); + // Should initialize the memory in case we're dealing with non POD types. + if (NumTraits::RequireInitialization) { + for (Index i = 0; i < numValues; ++i) { + new(m_buffer+i) CoeffReturnType(); + } + } + typedef TensorEvalToOp< const typename internal::remove_const::type > EvalTo; + EvalTo evalToTmp(m_buffer, m_op); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::type, PacketAccess>::run(evalToTmp, m_device); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_device.deallocate(m_buffer); + m_buffer = NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt(m_buffer + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC typename MakePointer::Type data() const { return m_buffer; } + + /// required by sycl in order to extract the sycl accessor + const TensorEvaluator& impl() { return m_impl; } + /// used by sycl in order to build the sycl buffer + const Device& device() const{return m_device;} + private: + TensorEvaluator m_impl; + const ArgType m_op; + const Device& m_device; + typename MakePointer::Type m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h new file mode 100644 index 00000000..6e76b5c5 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -0,0 +1,109 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H + +namespace Eigen { + +// MakePointer class is used as a container of the adress space of the pointer +// on the host and on the device. From the host side it generates the T* pointer +// and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to +// T* m_data on the host. It is always called on the device. +// Specialisation of MakePointer class for creating the sycl buffer with +// map_allocator. +template struct MakePointer { + typedef T* Type; +}; + +template class MakePointer_ = MakePointer> class TensorMap; +template class Tensor; +template class TensorFixedSize; +template class TensorRef; +template class TensorBase; + +template class TensorCwiseNullaryOp; +template class TensorCwiseUnaryOp; +template class TensorCwiseBinaryOp; +template class TensorCwiseTernaryOp; +template class TensorSelectOp; +template class MakePointer_ = MakePointer > class TensorReductionOp; +template class TensorIndexTupleOp; +template class TensorTupleReducerOp; +template class TensorConcatenationOp; +template class TensorContractionOp; +template class TensorConversionOp; +template class TensorConvolutionOp; +template class TensorFFTOp; +template class TensorPatchOp; +template class TensorImagePatchOp; +template class TensorVolumePatchOp; +template class TensorBroadcastingOp; +template class TensorChippingOp; +template class TensorReshapingOp; +template class TensorLayoutSwapOp; +template class TensorSlicingOp; +template class TensorReverseOp; +template class TensorPaddingOp; +template class TensorShufflingOp; +template class TensorStridingOp; +template class TensorStridingSlicingOp; +template class TensorInflationOp; +template class TensorGeneratorOp; +template class TensorAssignOp; +template class TensorScanOp; + +template class TensorCustomUnaryOp; +template class TensorCustomBinaryOp; + +template class MakePointer_ = MakePointer> class TensorEvalToOp; +template class MakePointer_ = MakePointer> class TensorForcedEvalOp; + +template class TensorDevice; +template struct TensorEvaluator; + +struct DefaultDevice; +struct ThreadPoolDevice; +struct GpuDevice; +struct SyclDevice; + +enum FFTResultType { + RealPart = 0, + ImagPart = 1, + BothParts = 2 +}; + +enum FFTDirection { + FFT_FORWARD = 0, + FFT_REVERSE = 1 +}; + + +namespace internal { + +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess; +}; + +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess && + TensorEvaluator::IsAligned; +}; + +template ::value> +class TensorExecutor; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h new file mode 100644 index 00000000..4ed080d3 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -0,0 +1,489 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H + +namespace Eigen { +namespace internal { + + +/** \internal + * \brief Template functor to compute the modulo between an array and a scalar. + */ +template +struct scalar_mod_op { + EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; } + const Scalar m_divisor; +}; +template +struct functor_traits > +{ enum { Cost = scalar_div_cost::value, PacketAccess = false }; }; + + +/** \internal + * \brief Template functor to compute the modulo between 2 arrays. + */ +template +struct scalar_mod2_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op); + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } +}; +template +struct functor_traits > +{ enum { Cost = scalar_div_cost::value, PacketAccess = false }; }; + +template +struct scalar_fmod_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& a, const Scalar& b) const { + return numext::fmod(a, b); + } +}; +template +struct functor_traits > { + enum { Cost = 13, // Reciprocal throughput of FPREM on Haswell. + PacketAccess = false }; +}; + + +/** \internal + * \brief Template functor to compute the sigmoid of a scalar + * \sa class CwiseUnaryOp, ArrayBase::sigmoid() + */ +template +struct scalar_sigmoid_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { + const T one = T(1); + return one / (one + numext::exp(-x)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(const Packet& x) const { + const Packet one = pset1(T(1)); + return pdiv(one, padd(one, pexp(pnegate(x)))); + } +}; + +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost * 2 + NumTraits::MulCost * 6, + PacketAccess = packet_traits::HasAdd && packet_traits::HasDiv && + packet_traits::HasNegate && packet_traits::HasExp + }; +}; + + +template +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + +// Standard reduction functors +template struct SumReducer +{ + static const bool PacketAccess = packet_traits::HasAdd; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + internal::scalar_sum_op sum_op; + *accum = sum_op(*accum, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = padd(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_sum_op sum_op; + return sum_op(saccum, predux(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasAdd + }; +}; + + +template struct MeanReducer +{ + static const bool PacketAccess = packet_traits::HasAdd && !NumTraits::IsInteger; + static const bool IsStateful = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + MeanReducer() : scalarCount_(0), packetCount_(0) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + internal::scalar_sum_op sum_op; + *accum = sum_op(*accum, t); + scalarCount_++; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + (*accum) = padd(*accum, p); + packetCount_++; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum / scalarCount_; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, pset1(packetCount_)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_sum_op sum_op; + return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits::size); + } + + protected: + DenseIndex scalarCount_; + DenseIndex packetCount_; +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasAdd + }; +}; + + +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits::lowest(); + } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return -Eigen::NumTraits::infinity(); + } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits::highest(); + } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits::infinity(); + } +}; + + +template struct MaxReducer +{ + static const bool PacketAccess = packet_traits::HasMax; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t > *accum) { *accum = t; } + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmax(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return MinMaxBottomValue::IsInteger>::bottom_value(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return numext::maxi(saccum, predux_max(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasMax + }; +}; + + +template struct MinReducer +{ + static const bool PacketAccess = packet_traits::HasMin; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t < *accum) { *accum = t; } + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmin(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return MinMaxBottomValue::IsInteger>::bottom_value(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return numext::mini(saccum, predux_min(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasMin + }; +}; + + +template struct ProdReducer +{ + static const bool PacketAccess = packet_traits::HasMul; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + internal::scalar_product_op prod_op; + (*accum) = prod_op(*accum, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmul(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(1); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_product_op prod_op; + return prod_op(saccum, predux_mul(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::MulCost, + PacketAccess = PacketType::HasMul + }; +}; + + +struct AndReducer +{ + static const bool PacketAccess = false; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { + *accum = *accum && t; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { + return accum; + } +}; + +template +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + + +struct OrReducer { + static const bool PacketAccess = false; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { + *accum = *accum || t; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { + return false; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { + return accum; + } +}; + +template +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + + +// Argmin/Argmax reducers +template struct ArgMaxTupleReducer +{ + static const bool PacketAccess = false; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t.second > accum->second) { *accum = t; } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits::lowest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { + return accum; + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false + }; +}; + + +template struct ArgMinTupleReducer +{ + static const bool PacketAccess = false; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { + if (t.second < accum->second) { *accum = t; } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits::highest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { + return accum; + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false + }; +}; + + +template +class GaussianGenerator { + public: + static const bool PacketAccess = false; + + EIGEN_DEVICE_FUNC GaussianGenerator(const array& means, + const array& std_devs) + : m_means(means) + { + for (size_t i = 0; i < NumDims; ++i) { + m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; + } + } + + EIGEN_DEVICE_FUNC T operator()(const array& coordinates) const { + T tmp = T(0); + for (size_t i = 0; i < NumDims; ++i) { + T offset = coordinates[i] - m_means[i]; + tmp += offset * offset / m_two_sigmas[i]; + } + return numext::exp(-tmp); + } + + private: + array m_means; + array m_two_sigmas; +}; + +template +struct functor_traits > { + enum { + Cost = NumDims * (2 * NumTraits::AddCost + NumTraits::MulCost + + functor_traits >::Cost) + + functor_traits >::Cost, + PacketAccess = GaussianGenerator::PacketAccess + }; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h new file mode 100644 index 00000000..008d113d --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -0,0 +1,185 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H + +namespace Eigen { + +/** \class TensorGeneratorOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor generator class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorGeneratorOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorGeneratorOp type; +}; + +} // end namespace internal + + + +template +class TensorGeneratorOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator) + : m_xpr(expr), m_generator(generator) {} + + EIGEN_DEVICE_FUNC + const Generator& generator() const { return m_generator; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Generator m_generator; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorGeneratorOp XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator::Dimensions Dimensions; + static const int NumDims = internal::array_size::value; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + enum { + IsAligned = false, + PacketAccess = (internal::unpacket_traits::size > 1), + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_generator(op.generator()) + { + TensorEvaluator impl(op.expression(), device); + m_dimensions = impl.dimensions(); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + array coords; + extract_coordinates(index, coords); + return m_generator(coords); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool) const { + // TODO(rmlarsen): This is just a placeholder. Define interface to make + // generators return their cost. + return TensorOpCost(0, 0, TensorOpCost::AddCost() + + TensorOpCost::MulCost()); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void extract_coordinates(Index index, array& coords) const { + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[NumDims-1] = index; + } + } + + Dimensions m_dimensions; + array m_strides; + Generator m_generator; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h new file mode 100644 index 00000000..403d6904 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h @@ -0,0 +1,33 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Eugene Brevdo +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H + +namespace Eigen { + +/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors. + * + * This function computes the regularized incomplete beta function (integral). + * + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const + TensorCwiseTernaryOp, + const ADerived, const BDerived, const XDerived> + betainc(const ADerived& a, const BDerived& b, const XDerived& x) { + return TensorCwiseTernaryOp< + internal::scalar_betainc_op, const ADerived, + const BDerived, const XDerived>( + a, b, x, internal::scalar_betainc_op()); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h new file mode 100644 index 00000000..d3ae8bcb --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -0,0 +1,79 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H +#define EIGEN_CXX11_TENSOR_TENSOR_IO_H + +namespace Eigen { + +namespace internal { + +// Print the tensor as a 2d matrix +template +struct TensorPrinter { + static void run (std::ostream& os, const Tensor& tensor) { + typedef typename internal::remove_const::type Scalar; + typedef typename Tensor::Index Index; + const Index total_size = internal::array_prod(tensor.dimensions()); + if (total_size > 0) { + const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions()); + static const int layout = Tensor::Layout; + Map > matrix(const_cast(tensor.data()), first_dim, total_size/first_dim); + os << matrix; + } + } +}; + + +// Print the tensor as a vector +template +struct TensorPrinter { + static void run (std::ostream& os, const Tensor& tensor) { + typedef typename internal::remove_const::type Scalar; + typedef typename Tensor::Index Index; + const Index total_size = internal::array_prod(tensor.dimensions()); + if (total_size > 0) { + Map > array(const_cast(tensor.data()), total_size); + os << array; + } + } +}; + + +// Print the tensor as a scalar +template +struct TensorPrinter { + static void run (std::ostream& os, const Tensor& tensor) { + os << tensor.coeff(0); + } +}; +} + +template +std::ostream& operator << (std::ostream& os, const TensorBase& expr) { + typedef TensorEvaluator, DefaultDevice> Evaluator; + typedef typename Evaluator::Dimensions Dimensions; + + // Evaluate the expression if needed + TensorForcedEvalOp eval = expr.eval(); + Evaluator tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + + // Print the result + static const int rank = internal::array_size::value; + internal::TensorPrinter::run(os, tensor); + + // Cleanup. + tensor.cleanup(); + return os; +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h new file mode 100644 index 00000000..7e25f4f2 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -0,0 +1,509 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H + +namespace Eigen { + +/** \class TensorImagePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for image processing. + * This assumes that the input has a least 3 dimensions ordered as follow: + * 1st dimension: channels (of size d) + * 2nd dimension: rows (of size r) + * 3rd dimension: columns (of size c) + * There can be additional dimensions such as time (for video) or batch (for + * bulk processing after the first 3. + * Calling the image patch code with patch_rows and patch_cols is equivalent + * to calling the regular patch extraction code with parameters d, patch_rows, + * patch_cols, and 1 for all the additional dimensions. + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename internal::remove_const::type Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorImagePatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorImagePatchOp type; +}; + +} // end namespace internal + +template +class TensorImagePatchOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + PaddingType padding_type, Scalar padding_value) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), + m_padding_type(padding_type), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + DenseIndex padding_top, DenseIndex padding_bottom, + DenseIndex padding_left, DenseIndex padding_right, + Scalar padding_value) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom), + m_padding_left(padding_left), m_padding_right(padding_right), + m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC + DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC + DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC + bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC + DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC + DenseIndex padding_bottom() const { return m_padding_bottom; } + EIGEN_DEVICE_FUNC + DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC + DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC + PaddingType padding_type() const { return m_padding_type; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType m_padding_type; + const Scalar m_padding_value; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorImagePatchOp XprType; + typedef typename XprType::Index Index; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims + 1; + typedef DSizes Dimensions; + typedef typename internal::remove_const::type Scalar; + typedef TensorEvaluator, + Device> Self; + typedef TensorEvaluator Impl; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // Caches a few variables. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputRows = input_dims[1]; + m_inputCols = input_dims[2]; + } else { + m_inputDepth = input_dims[NumInputDims-1]; + m_inputRows = input_dims[NumInputDims-2]; + m_inputCols = input_dims[NumInputDims-3]; + } + + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + // The "effective" input rows and input cols are the input rows and cols + // after inflating them with zeros. + // For examples, a 2x3 matrix with row_inflate_strides and + // col_inflate_strides of 2 comes from: + // A B C + // D E F + // + // to a matrix is 3 x 5: + // + // A . B . C + // . . . . . + // D . E . F + + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = numext::maxi(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2); + m_colPaddingLeft = numext::maxi(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2); + break; + case PADDING_SAME: + m_outputRows = numext::ceil(m_input_rows_eff / static_cast(m_row_strides)); + m_outputCols = numext::ceil(m_input_cols_eff / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; + break; + default: + eigen_assert(false && "unexpected padding"); + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + + // Dimensions for result of extraction. + if (static_cast(Layout) == static_cast(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_rows + // 2: patch_cols + // 3: number of patches + // 4 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_rows(); + m_dimensions[2] = op.patch_cols(); + m_dimensions[3] = m_outputRows * m_outputCols; + for (int i = 4; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i-1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_rows + // NumDims-3: patch_cols + // NumDims-4: number of patches + // NumDims-5 and beyond: anything else (such as batch). + m_dimensions[NumDims-1] = input_dims[NumInputDims-1]; + m_dimensions[NumDims-2] = op.patch_rows(); + m_dimensions[NumDims-3] = op.patch_cols(); + m_dimensions[NumDims-4] = m_outputRows * m_outputCols; + for (int i = NumDims-5; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for moving the patch in various dimensions. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_colStride = m_dimensions[1]; + m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[3]; + } else { + m_colStride = m_dimensions[NumDims-2]; + m_patchStride = m_colStride * m_dimensions[NumDims-3] * m_dimensions[NumDims-1]; + m_otherStride = m_patchStride * m_dimensions[NumDims-4]; + } + + // Strides for navigating through the input tensor. + m_rowInputStride = m_inputDepth; + m_colInputStride = m_inputDepth * m_inputRows; + m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols; + + // Fast representations of different variables. + m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastInflateRowStride = internal::TensorIntDivisor(m_row_inflate_strides); + m_fastInflateColStride = internal::TensorIntDivisor(m_col_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor(m_input_cols_eff); + + // Number of patches in the width dimension. + m_fastOutputRows = internal::TensorIntDivisor(m_outputRows); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Other ways to index this element. + const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate col index in the input original tensor. + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + PacketSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - + m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate col indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - + m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { + // no padding + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.template packet(inputIndex); + } + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + const TensorEvaluator& impl() const { return m_impl; } + + Index rowPaddingTop() const { return m_rowPaddingTop; } + Index colPaddingLeft() const { return m_colPaddingLeft; } + Index outputRows() const { return m_outputRows; } + Index outputCols() const { return m_outputCols; } + Index userRowStride() const { return m_row_strides; } + Index userColStride() const { return m_col_strides; } + Index userInRowStride() const { return m_in_row_strides; } + Index userInColStride() const { return m_in_col_strides; } + Index rowInflateStride() const { return m_row_inflate_strides; } + Index colInflateStride() const { return m_col_inflate_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // We conservatively estimate the cost for the code path where the computed + // index is inside the original image and + // TensorEvaluator::CoordAccess is false. + const double compute_cost = 3 * TensorOpCost::DivCost() + + 6 * TensorOpCost::MulCost() + + 8 * TensorOpCost::MulCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + + Index m_otherStride; + Index m_patchStride; + Index m_colStride; + Index m_row_strides; + Index m_col_strides; + + Index m_in_row_strides; + Index m_in_col_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + internal::TensorIntDivisor m_fastOtherStride; + internal::TensorIntDivisor m_fastPatchStride; + internal::TensorIntDivisor m_fastColStride; + internal::TensorIntDivisor m_fastInflateRowStride; + internal::TensorIntDivisor m_fastInflateColStride; + internal::TensorIntDivisor m_fastInputColsEff; + + Index m_rowInputStride; + Index m_colInputStride; + Index m_patchInputStride; + + Index m_inputDepth; + Index m_inputRows; + Index m_inputCols; + + Index m_outputRows; + Index m_outputCols; + + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + internal::TensorIntDivisor m_fastOutputRows; + internal::TensorIntDivisor m_fastOutputDepth; + + Scalar m_paddingValue; + + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h new file mode 100644 index 00000000..2e91471c --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -0,0 +1,725 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H + + +#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES + +#define EIGEN_HAS_INDEX_LIST + +namespace Eigen { + +/** \internal + * + * \class TensorIndexList + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode a set of Tensor dimensions/indices. + * + * The indices in the list can be known at compile time or at runtime. A mix + * of static and dynamic indices can also be provided if needed. The tensor + * code will attempt to take advantage of the indices that are known at + * compile time to optimize the code it generates. + * + * This functionality requires a c++11 compliant compiler. If your compiler + * is older you need to use arrays of indices instead. + * + * Several examples are provided in the cxx11_tensor_index_list.cpp file. + * + * \sa Tensor + */ + +template +struct type2index { + static const DenseIndex value = n; + EIGEN_DEVICE_FUNC constexpr operator DenseIndex() const { return n; } + EIGEN_DEVICE_FUNC void set(DenseIndex val) { + eigen_assert(val == n); + } +}; + +// This can be used with IndexPairList to get compile-time constant pairs, +// such as IndexPairList, type2indexpair<3,4>>(). +template +struct type2indexpair { + static const DenseIndex first = f; + static const DenseIndex second = s; + + constexpr EIGEN_DEVICE_FUNC operator IndexPair() const { + return IndexPair(f, s); + } + + EIGEN_DEVICE_FUNC void set(const IndexPair& val) { + eigen_assert(val.first == f); + eigen_assert(val.second == s); + } +}; + + +template struct NumTraits > +{ + typedef DenseIndex Real; + enum { + IsComplex = 0, + RequireInitialization = false, + ReadCost = 1, + AddCost = 1, + MulCost = 1 + }; + + EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; } + EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; } + EIGEN_DEVICE_FUNC static inline Real highest() { return n; } + EIGEN_DEVICE_FUNC static inline Real lowest() { return n; } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) { + val = new_val; +} +template +EIGEN_DEVICE_FUNC void update_value(type2index& val, DenseIndex new_val) { + val.set(new_val); +} + +template +EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair new_val) { + val = new_val; +} +template +EIGEN_DEVICE_FUNC void update_value(type2indexpair& val, IndexPair new_val) { + val.set(new_val); +} + + +template +struct is_compile_time_constant { + static constexpr bool value = false; +}; + +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool value = true; +}; + +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool value = true; +}; + + +template +struct IndexTuple; + +template +struct IndexTuple { + EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() { } + EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) { } + + constexpr static int count = 1 + sizeof...(O); + T head; + IndexTuple others; + typedef T Head; + typedef IndexTuple Other; +}; + +template + struct IndexTuple { + EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() { } + EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) { } + + constexpr static int count = 1; + T head; + typedef T Head; +}; + + +template +struct IndexTupleExtractor; + +template +struct IndexTupleExtractor { + + typedef typename IndexTupleExtractor::ValType ValType; + + EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { + return IndexTupleExtractor::get_val(val.others); + } + + EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { + return IndexTupleExtractor::get_val(val.others); + } + template + EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { + IndexTupleExtractor::set_val(val.others, new_val); + } + +}; + +template + struct IndexTupleExtractor<0, T, O...> { + + typedef T ValType; + + EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { + return val.head; + } + EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { + return val.head; + } + template + EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { + val.head = new_val; + } +}; + + + +template +EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor::ValType& array_get(IndexTuple& tuple) { + return IndexTupleExtractor::get_val(tuple); +} +template +EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor::ValType& array_get(const IndexTuple& tuple) { + return IndexTupleExtractor::get_val(tuple); +} +template + struct array_size > { + static const size_t value = IndexTuple::count; +}; +template + struct array_size > { + static const size_t value = IndexTuple::count; +}; + + + + +template +struct tuple_coeff { + template + EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple& t) { + // return array_get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); + return (i == Idx ? array_get(t) : tuple_coeff::get(i, t)); + } + template + EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const ValueT& value) { + if (i == Idx) { + update_value(array_get(t), value); + } else { + tuple_coeff::set(i, t, value); + } + } + + template + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple& t) { + return ((i == Idx) & is_compile_time_constant::ValType>::value) || + tuple_coeff::value_known_statically(i, t); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple& t) { + return is_compile_time_constant::ValType>::value && + tuple_coeff::values_up_to_known_statically(t); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple& t) { + return is_compile_time_constant::ValType>::value && + is_compile_time_constant::ValType>::value && + array_get(t) > array_get(t) && + tuple_coeff::values_up_to_statically_known_to_increase(t); + } +}; + +template +struct tuple_coeff<0, ValueT> { + template + EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return array_get<0>(t)/* * (i == 0)*/; + } + template + EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const ValueT value) { + eigen_assert (i == 0); + update_value(array_get<0>(t), value); + } + template + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple&) { + return is_compile_time_constant::ValType>::value & (i == 0); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple&) { + return is_compile_time_constant::ValType>::value; + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple&) { + return true; + } +}; +} // namespace internal + + + +template +struct IndexList : internal::IndexTuple { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { + return internal::tuple_coeff >::value-1, DenseIndex>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const { + return internal::tuple_coeff >::value-1, DenseIndex>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { + return internal::tuple_coeff >::value-1, DenseIndex>::set(i, *this, value); + } + + EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } + EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple(first, other...) { } + EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple() { } + + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { + return internal::tuple_coeff >::value-1, DenseIndex>::value_known_statically(i, *this); + } + EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { + return internal::tuple_coeff >::value-1, DenseIndex>::values_up_to_known_statically(*this); + } + + EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { + return internal::tuple_coeff >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this); + } +}; + + +template +constexpr IndexList make_index_list(FirstType val1, OtherTypes... other_vals) { + return IndexList(val1, other_vals...); +} + + +template +struct IndexPairList : internal::IndexTuple { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair operator[] (const DenseIndex i) const { + return internal::tuple_coeff >::value-1, IndexPair>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair value) { + return internal::tuple_coeff>::value-1, IndexPair >::set(i, *this, value); + } + + EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } + EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple() { } + + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { + return internal::tuple_coeff >::value-1, DenseIndex>::value_known_statically(i, *this); + } +}; + +namespace internal { + +template size_t array_prod(const IndexList& sizes) { + size_t result = 1; + for (int i = 0; i < array_size >::value; ++i) { + result *= sizes[i]; + } + return result; +} + +template struct array_size > { + static const size_t value = array_size >::value; +}; +template struct array_size > { + static const size_t value = array_size >::value; +}; + +template struct array_size > { + static const size_t value = std::tuple_size >::value; +}; +template struct array_size > { + static const size_t value = std::tuple_size >::value; +}; + +template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList& a) { + return IndexTupleExtractor::get_val(a); +} +template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(const IndexList& a) { + return IndexTupleExtractor::get_val(a); +} + +template +struct index_known_statically_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { + return false; + } +}; + +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { + return IndexList().value_known_statically(i); + } +}; + +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { + return IndexList().value_known_statically(i); + } +}; + + +template +struct all_indices_known_statically_impl { + static constexpr bool run() { + return false; + } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return IndexList().all_values_known_statically(); + } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return IndexList().all_values_known_statically(); + } +}; + + +template +struct indices_statically_known_to_increase_impl { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return false; + } +}; + +template + struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return Eigen::IndexList().values_statically_known_to_increase(); + } +}; + +template + struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return Eigen::IndexList().values_statically_known_to_increase(); + } +}; + + +template +struct index_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) == value); + } +}; + +template +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) == value); + } +}; + + +template +struct index_statically_ne_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) != value); + } +}; + +template +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) != value); + } +}; + + +template +struct index_statically_gt_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) > value); + } +}; + +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) > value); + } +}; + + + +template +struct index_statically_lt_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) < value); + } +}; + +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) < value); + } +}; + + + +template +struct index_pair_first_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_pair_first_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList().value_known_statically(i) & + (IndexPairList().operator[](i).first == value); + } +}; + +template +struct index_pair_first_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList().value_known_statically(i) & + (IndexPairList().operator[](i).first == value); + } +}; + + + +template +struct index_pair_second_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_pair_second_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList().value_known_statically(i) & + (IndexPairList().operator[](i).second == value); + } +}; + +template +struct index_pair_second_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList().value_known_statically(i) & + (IndexPairList().operator[](i).second == value); + } +}; + + +} // end namespace internal +} // end namespace Eigen + +#else + +namespace Eigen { +namespace internal { + +template +struct index_known_statically_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { + return false; + } +}; + +template +struct all_indices_known_statically_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { + return false; + } +}; + +template +struct indices_statically_known_to_increase_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { + return false; + } +}; + +template +struct index_statically_eq_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_ne_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_gt_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_lt_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_pair_first_statically_eq_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_pair_second_statically_eq_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + + + +} // end namespace internal +} // end namespace Eigen + +#endif + + +namespace Eigen { +namespace internal { +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(DenseIndex i) { + return index_known_statically_impl::run(i); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() { + return all_indices_known_statically_impl::run(); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() { + return indices_statically_known_to_increase_impl::run(); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(DenseIndex i, DenseIndex value) { + return index_statically_eq_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(DenseIndex i, DenseIndex value) { + return index_statically_ne_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(DenseIndex i, DenseIndex value) { + return index_statically_gt_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i, DenseIndex value) { + return index_statically_lt_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) { + return index_pair_first_statically_eq_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) { + return index_pair_second_statically_eq_impl::run(i, value); +} + +} // end namespace internal +} // end namespace Eigen + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h new file mode 100644 index 00000000..1f3d4426 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -0,0 +1,229 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Ke Yang +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H + +namespace Eigen { + +/** \class TensorInflation + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor inflation class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorInflationOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorInflationOp type; +}; + +} // end namespace internal + +template +class TensorInflationOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides) + : m_xpr(expr), m_strides(strides) {} + + EIGEN_DEVICE_FUNC + const Strides& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Strides m_strides; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorInflationOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_strides(op.strides()) + { + m_dimensions = m_impl.dimensions(); + // Expand each dimension to the inflated dimension. + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1; + } + + // Remember the strides for fast division. + for (int i = 0; i < NumDims; ++i) { + m_fastStrides[i] = internal::TensorIntDivisor(m_strides[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + } else { // RowMajor + m_outputStrides[NumDims-1] = 1; + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + // Computes the input index given the output index. Returns true if the output + // index doesn't fall into a hole. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const + { + eigen_assert(index < dimensions().TotalSize()); + *inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[0] * m_strides[0]) { + return false; + } + *inputIndex += index / m_strides[0]; + return true; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[NumDims-1] * m_strides[NumDims-1]) { + return false; + } + *inputIndex += index / m_strides[NumDims - 1]; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + if (getInputIndex(index, &inputIndex)) { + return m_impl.coeff(inputIndex); + } else { + return Scalar(0); + } + } + + // TODO(yangke): optimize this function so that we can detect and produce + // all-zero packets + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (3 * TensorOpCost::DivCost() + + 3 * TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + const double input_size = m_impl.dimensions().TotalSize(); + const double output_size = m_dimensions.TotalSize(); + if (output_size == 0) + return TensorOpCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0, + compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + const Strides m_strides; + array, NumDims> m_fastStrides; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h new file mode 100644 index 00000000..8dbe15be --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -0,0 +1,82 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H +#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H + +#if EIGEN_HAS_VARIADIC_TEMPLATES + +#include + +namespace Eigen { + +/** \class TensorInitializer + * \ingroup CXX11_Tensor_Module + * + * \brief Helper template to initialize Tensors from std::initializer_lists. + */ +namespace internal { + +template +struct Initializer { + typedef std::initializer_list< + typename Initializer::InitList> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + for (auto v : vals) { + (*indices)[traits::NumDimensions - N] = i++; + Initializer::run(tensor, indices, v); + } + } +}; + +template +struct Initializer { + typedef std::initializer_list::Scalar> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + // There is likely a faster way to do that than iterating. + for (auto v : vals) { + (*indices)[traits::NumDimensions - 1] = i++; + tensor.coeffRef(*indices) = v; + } + } +}; + +template +struct Initializer { + typedef typename traits::Scalar InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>*, + const InitList& v) { + tensor.coeffRef(0) = v; + } +}; + + +template +void initialize_tensor(TensorEvaluator& tensor, + const typename Initializer::NumDimensions>::InitList& vals) { + Eigen::array::Index, traits::NumDimensions> indices; + Initializer::NumDimensions>::run(tensor, &indices, vals); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h new file mode 100644 index 00000000..dc1b3b87 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -0,0 +1,253 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H +#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H + + +namespace Eigen { + +/** \internal + * + * \class TensorIntDiv + * \ingroup CXX11_Tensor_Module + * + * \brief Fast integer division by a constant. + * + * See the paper from Granlund and Montgomery for explanation. + * (at xxxp://dx.doi.org/10.1145/773473.178249) + * + * \sa Tensor + */ + +namespace internal { + +namespace { + + // Note: result is undefined if val == 0 + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + typename internal::enable_if::type count_leading_zeros(const T val) + { +#ifdef __CUDA_ARCH__ + return __clz(val); +#elif EIGEN_COMP_MSVC + unsigned long index; + _BitScanReverse(&index, val); + return 31 - index; +#else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); + return __builtin_clz(static_cast(val)); +#endif + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + typename internal::enable_if::type count_leading_zeros(const T val) + { +#ifdef __CUDA_ARCH__ + return __clzll(val); +#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 + unsigned long index; + _BitScanReverse64(&index, val); + return 63 - index; +#elif EIGEN_COMP_MSVC + // MSVC's _BitScanReverse64 is not available for 32bits builds. + unsigned int lo = (unsigned int)(val&0xffffffff); + unsigned int hi = (unsigned int)((val>>32)&0xffffffff); + int n; + if(hi==0) + n = 32 + count_leading_zeros(lo); + else + n = count_leading_zeros(hi); + return n; +#else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); + return __builtin_clzll(static_cast(val)); +#endif + } + + template + struct UnsignedTraits { + typedef typename conditional::type type; + }; + + template + struct DividerTraits { + typedef typename UnsignedTraits::type type; + static const int N = sizeof(T) * 8; + }; + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { +#if defined(__CUDA_ARCH__) + return __umulhi(a, b); +#else + return (static_cast(a) * b) >> 32; +#endif + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { +#if defined(__CUDA_ARCH__) + return __umul64hi(a, b); +#elif defined(__SIZEOF_INT128__) + __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); + return static_cast(v >> 64); +#else + return (TensorUInt128, uint64_t>(a) * TensorUInt128, uint64_t>(b)).upper(); +#endif + } + + template + struct DividerHelper { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) { + EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE); + return static_cast((static_cast(1) << (N+log_div)) / divider - (static_cast(1) << N) + 1); + } + }; + + template + struct DividerHelper<64, T> { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { +#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) + return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); +#else + const uint64_t shift = 1ULL << log_div; + TensorUInt128 result = TensorUInt128 >(shift, 0) / TensorUInt128, uint64_t>(divider) + - TensorUInt128, static_val<0> >(1, 0) + + TensorUInt128, static_val<1> >(1); + return static_cast(result); +#endif + } + }; +} + + +template +struct TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + multiplier = 0; + shift1 = 0; + shift2 = 0; + } + + // Must have 0 < divider < 2^31. This is relaxed to + // 0 < divider < 2^63 when using 64-bit indices on platforms that support + // the __uint128_t type. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = DividerTraits::N; + eigen_assert(static_cast::type>(divider) < NumTraits::highest()/2); + eigen_assert(divider > 0); + + // fast ln2 + const int leading_zeros = count_leading_zeros(static_cast(divider)); + int log_div = N - leading_zeros; + // if divider is a power of two then log_div is 1 more than it should be. + if ((static_cast::type>(1) << (log_div-1)) == static_cast::type>(divider)) + log_div--; + + multiplier = DividerHelper::computeMultiplier(log_div, divider); + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div-1 : 0; + } + + // Must have 0 <= numerator. On platforms that dont support the __uint128_t + // type numerator should also be less than 2^32-1. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + eigen_assert(static_cast::type>(numerator) < NumTraits::highest()/2); + //eigen_assert(numerator >= 0); // this is implicitly asserted by the line above + + UnsignedType t1 = muluh(multiplier, numerator); + UnsignedType t = (static_cast(numerator) - t1) >> shift1; + return (t1 + t) >> shift2; + } + + private: + typedef typename DividerTraits::type UnsignedType; + UnsignedType multiplier; + int32_t shift1; + int32_t shift2; +}; + + +// Optimized version for signed 32 bit integers. +// Derived from Hacker's Delight. +// Only works for divisors strictly greater than one +template <> +class TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + magic = 0; + shift = 0; + } + // Must have 2 <= divider + EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) { + eigen_assert(divider >= 2); + calcMagic(divider); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { +#ifdef __CUDA_ARCH__ + return (__umulhi(magic, n) >> shift); +#else + uint64_t v = static_cast(magic) * static_cast(n); + return (static_cast(v >> 32) >> shift); +#endif + } + +private: + // Compute the magic numbers. See Hacker's Delight section 10 for an in + // depth explanation. + EIGEN_DEVICE_FUNC void calcMagic(int32_t d) { + const unsigned two31 = 0x80000000; // 2**31. + unsigned ad = d; + unsigned t = two31 + (ad >> 31); + unsigned anc = t - 1 - t%ad; // Absolute value of nc. + int p = 31; // Init. p. + unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|. + unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|). + unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|. + unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|). + unsigned delta = 0; + do { + p = p + 1; + q1 = 2*q1; // Update q1 = 2**p/|nc|. + r1 = 2*r1; // Update r1 = rem(2**p, |nc|). + if (r1 >= anc) { // (Must be an unsigned + q1 = q1 + 1; // comparison here). + r1 = r1 - anc;} + q2 = 2*q2; // Update q2 = 2**p/|d|. + r2 = 2*r2; // Update r2 = rem(2**p, |d|). + if (r2 >= ad) { // (Must be an unsigned + q2 = q2 + 1; // comparison here). + r2 = r2 - ad;} + delta = ad - r2; + } while (q1 < delta || (q1 == delta && r1 == 0)); + + magic = (unsigned)(q2 + 1); + shift = p - 32; + } + + uint32_t magic; + int32_t shift; +}; + + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { + return divisor.divide(numerator); +} + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h new file mode 100644 index 00000000..e874e94f --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -0,0 +1,209 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H + +namespace Eigen { + +/** \class TensorLayoutSwap + * \ingroup CXX11_Tensor_Module + * + * \brief Swap the layout from col-major to row-major, or row-major + * to col-major, and invert the order of the dimensions. + * + * Beware: the dimensions are reversed by this operation. If you want to + * preserve the ordering of the dimensions, you need to combine this + * operation with a shuffle. + * + * \example: + * Tensor input(2, 4); + * Tensor output = input.swap_layout(); + * eigen_assert(output.dimension(0) == 4); + * eigen_assert(output.dimension(1) == 2); + * + * array shuffle(1, 0); + * output = input.swap_layout().shuffle(shuffle); + * eigen_assert(output.dimension(0) == 2); + * eigen_assert(output.dimension(1) == 4); + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = (traits::Layout == ColMajor) ? RowMajor : ColMajor; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorLayoutSwapOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorLayoutSwapOp type; +}; + +} // end namespace internal + + + +template +class TensorLayoutSwapOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorLayoutSwapOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + for(int i = 0; i < NumDims; ++i) { + m_dimensions[i] = m_impl.dimensions()[NumDims-1-i]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } + + const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; + Dimensions m_dimensions; +}; + + +// Eval as lvalue +template + struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorLayoutSwapOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, + CoordAccess = false // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket(index, x); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h new file mode 100644 index 00000000..5165bb61 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -0,0 +1,54 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H + + +/** use this macro in sfinae selection in templated functions + * + * template::value , int >::type = 0 + * > + * void foo(){} + * + * becomes => + * + * template::value ) + * > + * void foo(){} + */ + +// SFINAE requires variadic templates +#ifndef __CUDACC__ +#if EIGEN_HAS_VARIADIC_TEMPLATES + // SFINAE doesn't work for gcc <= 4.7 + #ifdef EIGEN_COMP_GNUC + #if EIGEN_GNUC_AT_LEAST(4,8) + #define EIGEN_HAS_SFINAE + #endif + #else + #define EIGEN_HAS_SFINAE + #endif +#endif +#endif + +#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \ + typename internal::enable_if< ( __condition__ ) , int >::type = 0 + + +#if EIGEN_HAS_CONSTEXPR +#define EIGEN_CONSTEXPR constexpr +#else +#define EIGEN_CONSTEXPR +#endif + + +#endif diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h new file mode 100644 index 00000000..ee430ae4 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -0,0 +1,323 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H + +namespace Eigen { + +// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) + +/** \class TensorMap + * \ingroup CXX11_Tensor_Module + * + * \brief A tensor expression mapping an existing array of data. + * + */ +/// `template class MakePointer_` is added to convert the host pointer to the device pointer. +/// It is added due to the fact that for our device compiler `T*` is not allowed. +/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. +/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_` is `T*` . +/// Therefore, by adding the default value, we managed to convert the type and it does not break any +/// existing code as its default value is `T*`. +template class MakePointer_> class TensorMap : public TensorBase > +{ + public: + typedef TensorMap Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::traits::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + /* typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Scalar *, + const Scalar *>::type + PointerType;*/ + typedef typename MakePointer_::Type PointerType; + typedef PointerType PointerArgType; + + static const int Options = Options_; + + static const Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { + IsAligned = ((int(Options_)&Aligned)==Aligned), + Layout = PlainObjectType::Layout, + CoordAccess = true, + RawAccess = true + }; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { + EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { + EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { + EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { + EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor) + : m_data(tensor.data()), m_dimensions(tensor.dimensions()) + { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PointerType data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const PointerType data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_data[0]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[1]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_data[0]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + { + static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + const std::size_t NumDims = sizeof...(otherIndices) + 2; + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[1]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Self& operator=(const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + private: + typename MakePointer_::Type m_data; + Dimensions m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h new file mode 100644 index 00000000..35075f85 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -0,0 +1,218 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_H + +namespace Eigen { + +template struct Cond {}; + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T1& choose(Cond, const T1& first, const T2&) { + return first; +} + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T2& choose(Cond, const T1&, const T2& second) { + return second; +} + + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const X x, const Y y) { + return static_cast((x + y - 1) / y); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const T x, const T y) { + return static_cast((x + y - 1) / y); +} + +template struct max_n_1 { + static const size_t size = n; +}; +template <> struct max_n_1<0> { + static const size_t size = 1; +}; + + +// Default packet types +template +struct PacketType : internal::packet_traits { + typedef typename internal::packet_traits::type type; +}; + +// For CUDA packet types when using a GpuDevice +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) +template <> +struct PacketType { + typedef half2 type; + static const int size = 2; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasLog = 1, + HasLog1p = 0, + HasLog10 = 0, + HasPow = 1, + }; +}; +#endif + +#if defined(EIGEN_USE_SYCL) +template + struct PacketType { + typedef T type; + static const int size = 1; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasArg = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0 + }; +}; +#endif + + +// Tuple mimics std::pair but works on e.g. nvcc. +template struct Tuple { + public: + U first; + V second; + + typedef U first_type; + typedef V second_type; + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Tuple() : first(), second() {} + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Tuple(const U& f, const V& s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Tuple& operator= (const Tuple& rhs) { + if (&rhs == this) return *this; + first = rhs.first; + second = rhs.second; + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void swap(Tuple& rhs) { + using numext::swap; + swap(first, rhs.first); + swap(second, rhs.second); + } +}; + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +bool operator==(const Tuple& x, const Tuple& y) { + return (x.first == y.first && x.second == y.second); +} + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +bool operator!=(const Tuple& x, const Tuple& y) { + return !(x == y); +} + + +// Can't use std::pairs on cuda devices +template struct IndexPair { + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {} + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC void set(IndexPair val) { + first = val.first; + second = val.second; + } + + Idx first; + Idx second; +}; + + +#ifdef EIGEN_HAS_SFINAE +namespace internal { + + template + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + array customIndices2Array(IndexType& idx, numeric_list) { + return { idx[Is]... }; + } + template + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + array customIndices2Array(IndexType&, numeric_list) { + return array(); + } + + /** Make an array (for index/dimensions) out of a custom index */ + template + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + array customIndices2Array(IndexType& idx) { + return customIndices2Array(idx, typename gen_numeric_list::type{}); + } + + + template + struct is_base_of + { + + typedef char (&yes)[1]; + typedef char (&no)[2]; + + template + struct Host + { + operator BB*() const; + operator DD*(); + }; + + template + static yes check(D*, T); + static no check(B*, int); + + static const bool value = sizeof(check(Host(), int())) == sizeof(yes); + }; + +} +#endif + + + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h new file mode 100644 index 00000000..09f06343 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -0,0 +1,888 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H +#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H + +namespace Eigen { + +/** \class TensorReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = array_size::value; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorReshapingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorReshapingOp type; +}; + +} // end namespace internal + + + +template +class TensorReshapingOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const NewDimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const NewDimensions m_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.dimensions()) + { + // The total size of the reshaped tensor must be equal to the total size + // of the input tensor. + eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast(m_impl.data()); } + + EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; + NewDimensions m_dimensions; +}; + + +// Eval as lvalue +template + struct TensorEvaluator, Device> + : public TensorEvaluator, Device> + +{ + typedef TensorEvaluator, Device> Base; + typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket(index, x); + } +}; + + +/** \class TensorSlicing + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor slicing class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = array_size::value; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorSlicingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorSlicingOp type; +}; + +} // end namespace internal + + + +template +class TensorSlicingOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes) + : m_xpr(expr), m_indices(indices), m_sizes(sizes) {} + + EIGEN_DEVICE_FUNC + const StartIndices& startIndices() const { return m_indices; } + EIGEN_DEVICE_FUNC + const Sizes& sizes() const { return m_sizes; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_indices; + const Sizes m_sizes; +}; + + +// Fixme: figure out the exact threshold +namespace { +template struct MemcpyTriggerForSlicing { + EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { } + EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; } + + private: + Index threshold_; +}; + +// It is very expensive to start the memcpy kernel on GPU: we therefore only +// use it for large copies. +#ifdef EIGEN_USE_GPU +template struct MemcpyTriggerForSlicing { + EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { } + EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; } +}; +#endif +} + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets and sizes. + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) + { + for (std::size_t i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const Sizes& output_dims = op.sizes(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + } + } else { + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + } + + // Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed. + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + } + } + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (!NumTraits::type>::RequireInitialization && data && m_impl.data()) { + Index contiguous_values = 1; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } else { + for (int i = NumDims-1; i >= 0; --i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } + // Use memcpy if it's going to be faster than using the regular evaluation. + const MemcpyTriggerForSlicing trigger(m_device); + if (trigger(contiguous_values)) { + Scalar* src = (Scalar*)m_impl.data(); + for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + Index offset = srcCoeff(i); + m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); + } + return false; + } + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + m_offsets[NumDims-1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + Scalar* result = m_impl.data(); + if (result) { + Index offset = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i+1; j < NumDims; ++j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i-1; j >= 0; --j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } + return result + offset; + } + return NULL; + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[NumDims-1]); + } + return inputIndex; + } + + array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + const Device& m_device; + Dimensions m_dimensions; + const StartIndices m_offsets; +}; + + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = internal::unpacket_traits::size; + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[0]); + inputIndices[1] += (indices[1] + this->m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket(inputIndices[0], x); + } + else { + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + internal::pstore(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + for (int i = 1; i < packetSize-1; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + } +}; + + + +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = array_size::value; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorStridingSlicingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorStridingSlicingOp type; +}; + +} // end namespace internal + + +template +class TensorStridingSlicingOp : public TensorBase > +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp( + const XprType& expr, const StartIndices& startIndices, + const StopIndices& stopIndices, const Strides& strides) + : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices), + m_strides(strides) {} + + EIGEN_DEVICE_FUNC + const StartIndices& startIndices() const { return m_startIndices; } + EIGEN_DEVICE_FUNC + const StartIndices& stopIndices() const { return m_stopIndices; } + EIGEN_DEVICE_FUNC + const StartIndices& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run( + assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_startIndices; + const StopIndices m_stopIndices; + const Strides m_strides; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorStridingSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets and sizes. + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()) + { + // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero + DSizes startIndicesClamped, stopIndicesClamped; + for (size_t i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); + if(m_strides[i]>0){ + startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); + }else{ + /* implies m_strides[i]<0 by assert */ + startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); + } + m_startIndices[i] = startIndicesClamped[i]; + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // check for degenerate intervals and compute output tensor shape + bool degenerate = false;; + for(int i = 0; i < NumDims; i++){ + Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; + if(interval == 0 || ((interval<0) != (m_strides[i]<0))){ + m_dimensions[i] = 0; + degenerate = true; + }else{ + m_dimensions[i] = interval / m_strides[i] + + (interval % m_strides[i] != 0 ? 1 : 0); + eigen_assert(m_dimensions[i] >= 0); + } + } + Strides output_dims = m_dimensions; + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = m_strides[0]; + m_offsets[0] = startIndicesClamped[0]; + Index previousDimProduct = 1; + for (int i = 1; i < NumDims; ++i) { + previousDimProduct *= input_dims[i-1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash + m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); + } + } else { + m_inputStrides[NumDims-1] = m_strides[NumDims-1]; + m_offsets[NumDims-1] = startIndicesClamped[NumDims-1]; + Index previousDimProduct = 1; + for (int i = NumDims - 2; i >= 0; --i) { + previousDimProduct *= input_dims[i+1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; + // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash + m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); + } + } + m_block_total_size_max = numext::maxi(static_cast(1), + device.lastLevelCacheSize() / + sizeof(Scalar)); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Strides Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + return NULL; + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } else { + for (int i = 0; i < NumDims; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } + return inputIndex; + } + + static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { + return numext::maxi(min, numext::mini(max,value)); + } + + array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + const Device& m_device; + DSizes m_startIndices; // clamped startIndices + DSizes m_dimensions; + DSizes m_offsets; // offset in a flattened shape + const Strides m_strides; + std::size_t m_block_total_size_max; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorStridingSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = TensorEvaluator::CoordAccess, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Strides Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h new file mode 100644 index 00000000..66a3b75e --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -0,0 +1,397 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H + +namespace Eigen { + +/** \class TensorPadding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor padding class. + * At the moment only padding with a constant value is supported. + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorPaddingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorPaddingOp type; +}; + +} // end namespace internal + + + +template +class TensorPaddingOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value) + : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC + const PaddingDimensions& padding() const { return m_padding_dims; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PaddingDimensions m_padding_dims; + const Scalar m_padding_value; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorPaddingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = true, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()) + { + // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead + // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector + // of 1 element first and then pad. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Compute dimensions + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] += m_padding[i].first + m_padding[i].second; + } + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; + } else { + m_inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1]; + } + m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (isPaddingAtIndexForDim(idx, i)) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (isPaddingAtIndexForDim(index, 0)) { + return m_paddingValue; + } + inputIndex += (index - m_padding[0].first); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i+1]; + if (isPaddingAtIndexForDim(idx, i)) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + if (isPaddingAtIndexForDim(index, NumDims-1)) { + return m_paddingValue; + } + inputIndex += (index - m_padding[NumDims-1].first); + } + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + if (static_cast(Layout) == static_cast(ColMajor)) { + return packetColMajor(index); + } + return packetRowMajor(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + TensorOpCost cost = m_impl.costPerCoeff(vectorized); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) + updateCostPerDimension(cost, i, i == 0); + } else { + for (int i = NumDims - 1; i >= 0; --i) + updateCostPerDimension(cost, i, i == NumDims - 1); + } + return cost; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( + Index index, int dim_index) const { +#if defined(EIGEN_HAS_INDEX_LIST) + return (!internal::index_pair_first_statically_eq(dim_index, 0) && + index < m_padding[dim_index].first) || + (!internal::index_pair_second_statically_eq(dim_index, 0) && + index >= m_dimensions[dim_index] - m_padding[dim_index].second); +#else + return (index < m_padding[dim_index].first) || + (index >= m_dimensions[dim_index] - m_padding[dim_index].second); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero( + int dim_index) const { +#if defined(EIGEN_HAS_INDEX_LIST) + return internal::index_pair_first_statically_eq(dim_index, 0); +#else + EIGEN_UNUSED_VARIABLE(dim_index); + return false; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero( + int dim_index) const { +#if defined(EIGEN_HAS_INDEX_LIST) + return internal::index_pair_second_statically_eq(dim_index, 0); +#else + EIGEN_UNUSED_VARIABLE(dim_index); + return false; +#endif + } + + + void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const { + const double in = static_cast(m_impl.dimensions()[i]); + const double out = in + m_padding[i].first + m_padding[i].second; + if (out == 0) + return; + const double reduction = in / out; + cost *= reduction; + if (first) { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + + reduction * (1 * TensorOpCost::AddCost())); + } else { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + + reduction * (2 * TensorOpCost::MulCost() + + 1 * TensorOpCost::DivCost())); + } + } + + protected: + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index first = index; + const Index last = index + PacketSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; + const Index lastPaddedRight = m_outputStrides[i+1]; + + if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + PacketSize - 1; + const Index first = index; + const Index lastPaddedLeft = m_padding[0].first; + const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); + const Index lastPaddedRight = m_outputStrides[1]; + + if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. + inputIndex += (index - m_padding[0].first); + return m_impl.template packet(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + + for (int i = 0; i < NumDims - 1; ++i) { + const Index first = index; + const Index last = index + PacketSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; + const Index lastPaddedRight = m_outputStrides[i]; + + if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. + const Index idx = index / m_outputStrides[i+1]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + PacketSize - 1; + const Index first = index; + const Index lastPaddedLeft = m_padding[NumDims-1].first; + const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); + const Index lastPaddedRight = m_outputStrides[NumDims-1]; + + if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. + inputIndex += (index - m_padding[NumDims-1].first); + return m_impl.template packet(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + PaddingDimensions m_padding; + + Scalar m_paddingValue; +}; + + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h new file mode 100644 index 00000000..cadb5fde --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -0,0 +1,269 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H + +namespace Eigen { + +/** \class TensorPatch + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor patch class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorPatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorPatchOp type; +}; + +} // end namespace internal + + + +template +class TensorPatchOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) + : m_xpr(expr), m_patch_dims(patch_dims) {} + + EIGEN_DEVICE_FUNC + const PatchDim& patch_dims() const { return m_patch_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PatchDim m_patch_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorPatchOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value + 1; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + Index num_patches = 1; + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const PatchDim& patch_dims = op.patch_dims(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims-1; ++i) { + m_dimensions[i] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[NumDims-1] = num_patches; + + m_inputStrides[0] = 1; + m_patchStrides[0] = 1; + for (int i = 1; i < NumDims-1; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1); + } + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + for (int i = 0; i < NumDims-1; ++i) { + m_dimensions[i+1] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[0] = num_patches; + + m_inputStrides[NumDims-2] = 1; + m_patchStrides[NumDims-2] = 1; + for (int i = NumDims-3; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_patchStrides[i] = m_patchStrides[i+1] * (input_dims[i+1] - patch_dims[i+1] + 1); + } + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims-2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? NumDims - 1 : 0; + // Find the location of the first element of the patch. + Index patchIndex = index / m_outputStrides[output_stride_index]; + // Find the offset of the element wrt the location of the first element. + Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i]; + patchOffset -= offsetIdx * m_outputStrides[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } else { + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i+1]; + patchOffset -= offsetIdx * m_outputStrides[i+1]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } + inputIndex += (patchIndex + patchOffset); + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? NumDims - 1 : 0; + Index indices[2] = {index, index + PacketSize - 1}; + Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], + indices[1] / m_outputStrides[output_stride_index]}; + Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], + indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]}; + + Index inputIndices[2] = {0, 0}; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], + patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], + patchOffsets[1] / m_outputStrides[i]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } else { + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], + patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i+1], + patchOffsets[1] / m_outputStrides[i+1]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i+1]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i+1]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } + inputIndices[0] += (patchIndices[0] + patchOffsets[0]); + inputIndices[1] += (patchIndices[1] + patchOffsets[1]); + + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < PacketSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (TensorOpCost::DivCost() + + TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + array m_patchStrides; + + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h new file mode 100644 index 00000000..68560d0a --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -0,0 +1,276 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H +#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H + +namespace Eigen { +namespace internal { + +namespace { + +EIGEN_DEVICE_FUNC uint64_t get_random_seed() { +#ifdef __CUDA_ARCH__ + // We don't support 3d kernels since we currently only use 1 and + // 2d kernels. + assert(threadIdx.z == 0); + return clock64() + + blockIdx.x * blockDim.x + threadIdx.x + + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); + +#elif defined _WIN32 + // Use the current time as a baseline. + SYSTEMTIME st; + GetSystemTime(&st); + int time = st.wSecond + 1000 * st.wMilliseconds; + // Mix in a random number to make sure that we get different seeds if + // we try to generate seeds faster than the clock resolution. + // We need 2 random values since the generator only generate 16 bits at + // a time (xxxps://msdn.microsoft.com/en-us/library/398ax69y.aspx) + int rnd1 = ::rand(); + int rnd2 = ::rand(); + uint64_t rnd = (rnd1 | rnd2 << 16) ^ time; + return rnd; + +#elif defined __APPLE__ + // Same approach as for win32, except that the random number generator + // is better (// xxxps://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random). + uint64_t rnd = ::random() ^ mach_absolute_time(); + return rnd; + +#else + // Augment the current time with pseudo random number generation + // to ensure that we get different seeds if we try to generate seeds + // faster than the clock resolution. + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + uint64_t rnd = ::random() ^ ts.tv_nsec; + return rnd; +#endif +} + +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) { + // TODO: Unify with the implementation in the non blocking thread pool. + uint64_t current = *state; + // Update the internal state + *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + // Generate the random output (using the PCG-XSH-RS scheme) + return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); +} + +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { + seed = seed ? seed : get_random_seed(); + return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; +} + +} // namespace + + +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +T RandomToTypeUniform(uint64_t* state) { + unsigned rnd = PCG_XSH_RS_generator(state); + return static_cast(rnd); +} + + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Eigen::half RandomToTypeUniform(uint64_t* state) { + Eigen::half result; + // Generate 10 random bits for the mantissa + unsigned rnd = PCG_XSH_RS_generator(state); + result.x = static_cast(rnd & 0x3ffu); + // Set the exponent + result.x |= (static_cast(15) << 10); + // Return the final result + return result - Eigen::half(1.0f); +} + + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float RandomToTypeUniform(uint64_t* state) { + typedef union { + uint32_t raw; + float fp; + } internal; + internal result; + // Generate 23 random bits for the mantissa mantissa + const unsigned rnd = PCG_XSH_RS_generator(state); + result.raw = rnd & 0x7fffffu; + // Set the exponent + result.raw |= (static_cast(127) << 23); + // Return the final result + return result.fp - 1.0f; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double RandomToTypeUniform(uint64_t* state) { + typedef union { + uint64_t raw; + double dp; + } internal; + internal result; + result.raw = 0; + // Generate 52 random bits for the mantissa + // First generate the upper 20 bits + unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu; + // The generate the lower 32 bits + unsigned rnd2 = PCG_XSH_RS_generator(state); + result.raw = (static_cast(rnd1) << 32) | rnd2; + // Set the exponent + result.raw |= (static_cast(1023) << 52); + // Return the final result + return result.dp - 1.0; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex RandomToTypeUniform >(uint64_t* state) { + return std::complex(RandomToTypeUniform(state), + RandomToTypeUniform(state)); +} +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex RandomToTypeUniform >(uint64_t* state) { + return std::complex(RandomToTypeUniform(state), + RandomToTypeUniform(state)); +} + +template class UniformRandomGenerator { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( + uint64_t seed = 0) { + m_state = PCG_XSH_RS_state(seed); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( + const UniformRandomGenerator& other) { + m_state = other.m_state; + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T operator()(Index i) const { + uint64_t local_state = m_state + i; + T result = RandomToTypeUniform(&local_state); + m_state = local_state; + return result; + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(Index i) const { + const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX T values[packetSize]; + uint64_t local_state = m_state + i; + for (int j = 0; j < packetSize; ++j) { + values[j] = RandomToTypeUniform(&local_state); + } + m_state = local_state; + return internal::pload(values); + } + + private: + mutable uint64_t m_state; +}; + +template +struct functor_traits > { + enum { + // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)). + Cost = 12 * NumTraits::AddCost * + ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)), + PacketAccess = UniformRandomGenerator::PacketAccess + }; +}; + + + +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +T RandomToTypeNormal(uint64_t* state) { + // Use the ratio of uniform method to generate numbers following a normal + // distribution. See for example Numerical Recipes chapter 7.3.9 for the + // details. + T u, v, q; + do { + u = RandomToTypeUniform(state); + v = T(1.7156) * (RandomToTypeUniform(state) - T(0.5)); + const T x = u - T(0.449871); + const T y = numext::abs(v) + T(0.386595); + q = x*x + y * (T(0.196)*y - T(0.25472)*x); + } while (q > T(0.27597) && + (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u)); + + return v/u; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex RandomToTypeNormal >(uint64_t* state) { + return std::complex(RandomToTypeNormal(state), + RandomToTypeNormal(state)); +} +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex RandomToTypeNormal >(uint64_t* state) { + return std::complex(RandomToTypeNormal(state), + RandomToTypeNormal(state)); +} + + +template class NormalRandomGenerator { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { + m_state = PCG_XSH_RS_state(seed); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( + const NormalRandomGenerator& other) { + m_state = other.m_state; + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T operator()(Index i) const { + uint64_t local_state = m_state + i; + T result = RandomToTypeNormal(&local_state); + m_state = local_state; + return result; + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(Index i) const { + const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX T values[packetSize]; + uint64_t local_state = m_state + i; + for (int j = 0; j < packetSize; ++j) { + values[j] = RandomToTypeNormal(&local_state); + } + m_state = local_state; + return internal::pload(values); + } + + private: + mutable uint64_t m_state; +}; + + +template +struct functor_traits > { + enum { + // On average, we need to generate about 3 random numbers + // 15 mul, 8 add, 1.5 logs + Cost = 3 * functor_traits >::Cost + + 15 * NumTraits::AddCost + 8 * NumTraits::AddCost + + 3 * functor_traits >::Cost / 2, + PacketAccess = NormalRandomGenerator::PacketAccess + }; +}; + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h new file mode 100644 index 00000000..33439138 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -0,0 +1,781 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H + +namespace Eigen { + +/** \class TensorReduction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reduction class. + * + */ + +namespace internal { + template class MakePointer_ > + struct traits > + : traits +{ + typedef traits XprTraits; + typedef typename XprTraits::Scalar Scalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + static const int NumDimensions = XprTraits::NumDimensions - array_size::value; + static const int Layout = XprTraits::Layout; + + template struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; +}; + +template class MakePointer_> +struct eval, Eigen::Dense> +{ + typedef const TensorReductionOp& type; +}; + +template class MakePointer_> +struct nested, 1, typename eval >::type> +{ + typedef TensorReductionOp type; +}; + + +template struct DimInitializer { + template EIGEN_DEVICE_FUNC + static void run(const InputDims& input_dims, + const array::value>& reduced, + OutputDims* output_dims, ReducedDims* reduced_dims) { + const int NumInputDims = internal::array_size::value; + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + (*reduced_dims)[reduceIndex] = input_dims[i]; + ++reduceIndex; + } else { + (*output_dims)[outputIndex] = input_dims[i]; + ++outputIndex; + } + } + } +}; + +template <> struct DimInitializer > { + template EIGEN_DEVICE_FUNC + static void run(const InputDims& input_dims, const array&, + Sizes<>*, array* reduced_dims) { + const int NumInputDims = internal::array_size::value; + for (int i = 0; i < NumInputDims; ++i) { + (*reduced_dims)[i] = input_dims[i]; + } + } +}; + + +template +struct are_inner_most_dims { + static const bool value = false; +}; +template +struct preserve_inner_most_dims { + static const bool value = false; +}; + +#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES +template +struct are_inner_most_dims{ + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_eq(0, 0); + static const bool tmp3 = index_statically_eq(array_size::value-1, array_size::value-1); + static const bool value = tmp1 & tmp2 & tmp3; +}; +template +struct are_inner_most_dims{ + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_eq(0, NumTensorDims - array_size::value); + static const bool tmp3 = index_statically_eq(array_size::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & tmp2 & tmp3; + +}; +template +struct preserve_inner_most_dims{ + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_gt(0, 0); + static const bool value = tmp1 & tmp2; + +}; +template +struct preserve_inner_most_dims{ + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_lt(array_size::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & tmp2; +}; +#endif + + +template +struct GenericDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + GenericDimReducer::reduce(self, input, reducer, accum); + } + } +}; +template +struct GenericDimReducer<0, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + for (int j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reduce(self.m_impl.coeff(input), accum); + } + } +}; +template +struct GenericDimReducer<-1, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) { + reducer.reduce(self.m_impl.coeff(index), accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalize(accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const int packetSize = internal::unpacket_traits::size; + const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + typename Self::PacketReturnType p = reducer.template initializePacket(); + for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { + reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &p); + } + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalizeBoth(accum, p); + } +}; + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { + eigen_assert(false && "should never be called"); + } +}; + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + InnerMostDimPreserver::reduce(self, input, reducer, accum); + } + } +}; + +template +struct InnerMostDimPreserver<0, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reducePacket(self.m_impl.template packet(input), accum); + } + } +}; +template +struct InnerMostDimPreserver<-1, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { + eigen_assert(false && "should never be called"); + } +}; + +// Default full reducer +template +struct FullReducer { + static const bool HasOptimizedImplementation = false; + + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { + const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + } +}; + + +#ifdef EIGEN_USE_THREADS +// Multithreaded full reducers +template +struct FullReducerShard { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer, + typename Self::CoeffReturnType* output) { + *output = InnerMostDimReducer::reduce( + self, firstIndex, numValuesToReduce, reducer); + } +}; + +// Multithreaded full reducer +template +struct FullReducer { + static const bool HasOptimizedImplementation = !Op::IsStateful; + static const int PacketSize = + unpacket_traits::size; + + // launch one reducer per thread and accumulate the result. + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, + typename Self::CoeffReturnType* output) { + typedef typename Self::Index Index; + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + if (num_coeffs == 0) { + *output = reducer.finalize(reducer.initialize()); + return; + } + const TensorOpCost cost = + self.m_impl.costPerCoeff(Vectorizable) + + TensorOpCost(0, 0, internal::functor_traits::Cost, Vectorizable, + PacketSize); + const int num_threads = TensorCostModel::numThreads( + num_coeffs, cost, device.numThreads()); + if (num_threads == 1) { + *output = + InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + return; + } + const Index blocksize = + std::floor(static_cast(num_coeffs) / num_threads); + const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); + + Barrier barrier(internal::convert_index(numblocks)); + MaxSizeVector shards(numblocks, reducer.initialize()); + for (Index i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier(&barrier, &FullReducerShard::run, + self, i * blocksize, blocksize, reducer, + &shards[i]); + } + typename Self::CoeffReturnType finalShard; + if (numblocks * blocksize < num_coeffs) { + finalShard = InnerMostDimReducer::reduce( + self, numblocks * blocksize, num_coeffs - numblocks * blocksize, + reducer); + } else { + finalShard = reducer.initialize(); + } + barrier.Wait(); + + for (Index i = 0; i < numblocks; ++i) { + reducer.reduce(shards[i], &finalShard); + } + *output = reducer.finalize(finalShard); + } +}; + +#endif + + +// Default inner reducer +template +struct InnerReducer { + static const bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; + +// Default outer reducer +template +struct OuterReducer { + static const bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; + + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template +__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); + + +#ifdef EIGEN_HAS_CUDA_FP16 +template +__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); +template +__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*); +template +__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*); + +#endif + +template +__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + +template +__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); +#endif + +} // end namespace internal + + +template class MakePointer_> +class TensorReductionOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dims& dims() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Op& reducer() const { return m_reducer; } + + protected: + typename XprType::Nested m_expr; + const Dims m_dims; + const Op m_reducer; +}; + + +// Eval as rvalue +template class MakePointer_, typename Device> +struct TensorEvaluator, Device> +{ + typedef TensorReductionOp XprType; + typedef typename XprType::Index Index; + typedef ArgType ChildType; + typedef typename TensorEvaluator::Dimensions InputDimensions; + static const int NumInputDims = internal::array_size::value; + static const int NumReducedDims = internal::array_size::value; + static const int NumOutputDims = NumInputDims - NumReducedDims; + typedef typename internal::conditional, DSizes >::type Dimensions; + typedef typename XprType::Scalar Scalar; + typedef TensorEvaluator, Device> Self; + static const bool InputPacketAccess = TensorEvaluator::PacketAccess; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, + PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; + static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; + static const bool RunningFullReduction = (NumOutputDims==0); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims()) + { + EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Build the bitmap indicating if an input dimension is reduced or not. + for (int i = 0; i < NumInputDims; ++i) { + m_reduced[i] = false; + } + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op.dims()[i] >= 0); + eigen_assert(op.dims()[i] < NumInputDims); + m_reduced[op.dims()[i]] = true; + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + internal::DimInitializer::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims); + + // Precompute output strides. + if (NumOutputDims > 0) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_outputStrides.back() = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + } + + // Precompute input strides. + if (NumInputDims > 0) { + array input_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i-1] * input_dims[i-1]; + } + } else { + input_strides.back() = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedStrides[reduceIndex] = input_strides[i]; + ++reduceIndex; + } else { + m_preservedStrides[outputIndex] = input_strides[i]; + ++outputIndex; + } + } + } + + // Special case for full reductions + if (NumOutputDims == 0) { + m_preservedStrides[0] = internal::array_prod(input_dims); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_::Type data) { + m_impl.evalSubExprsIfNeeded(NULL); + + // Use the FullReducer if possible. + if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction && + internal::FullReducer::HasOptimizedImplementation && + ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || + !RunningOnGPU))) { + bool need_assign = false; + if (!data) { + m_result = static_cast(m_device.allocate(sizeof(CoeffReturnType))); + data = m_result; + need_assign = true; + } + Op reducer(m_reducer); + internal::FullReducer::run(*this, reducer, m_device, data); + return need_assign; + } + else if(RunningOnSycl){ + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } + Op reducer(m_reducer); + internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return (m_result != NULL); + } + + // Attempt to use an optimized reduction. + else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) { + bool reducing_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + reducing_inner_dims &= m_reduced[i]; + } else { + reducing_inner_dims &= m_reduced[NumInputDims - 1 - i]; + } + } + if (internal::InnerReducer::HasOptimizedImplementation && + (reducing_inner_dims || ReducingInnerMostDims)) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) { + data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } + else { + return true; + } + } + Op reducer(m_reducer); + if (internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } + } + + bool preserving_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + preserving_inner_dims &= m_reduced[NumInputDims - 1 - i]; + } else { + preserving_inner_dims &= m_reduced[i]; + } + } + if (internal::OuterReducer::HasOptimizedImplementation && + preserving_inner_dims) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) { + data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } + else { + return true; + } + } + Op reducer(m_reducer); + if (internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } + } + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + if (m_result) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) { + return *(m_result + index); + } + Op reducer(m_reducer); + if (ReducingInnerMostDims || RunningFullReduction) { + const Index num_values_to_reduce = + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; + return internal::InnerMostDimReducer::reduce(*this, firstInput(index), + num_values_to_reduce, reducer); + } else { + typename Self::CoeffReturnType accum = reducer.initialize(); + internal::GenericDimReducer::reduce(*this, firstInput(index), reducer, &accum); + return reducer.finalize(accum); + } + } + + // TODO(bsteiner): provide a more efficient implementation. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions()))); + + if (RunningOnGPU && m_result) { + return internal::pload(m_result + index); + } + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; + const Index firstIndex = firstInput(index); + for (Index i = 0; i < PacketSize; ++i) { + Op reducer(m_reducer); + values[i] = internal::InnerMostDimReducer::reduce(*this, firstIndex + i * num_values_to_reduce, + num_values_to_reduce, reducer); + } + } else if (PreservingInnerMostDims) { + const Index firstIndex = firstInput(index); + const int innermost_dim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : NumOutputDims - 1; + // TBD: extend this the the n innermost dimensions that we preserve. + if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) { + Op reducer(m_reducer); + typename Self::PacketReturnType accum = reducer.template initializePacket(); + internal::InnerMostDimPreserver::reduce(*this, firstIndex, reducer, &accum); + return reducer.finalizePacket(accum); + } else { + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + } + } else { + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + // Must be called after evalSubExprsIfNeeded(). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + if (RunningFullReduction && m_result) { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } else { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const double compute_cost = num_values_to_reduce * internal::functor_traits::Cost; + return m_impl.costPerCoeff(vectorized) * num_values_to_reduce + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + } + + EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& impl() const { return m_impl; } + /// added for sycl in order to construct the buffer from the sycl device + const Device& device() const{return m_device;} + /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel + const Dims& xprDims() const {return m_xpr_dims;} + + + private: + template friend struct internal::GenericDimReducer; + template friend struct internal::InnerMostDimReducer; + template friend struct internal::InnerMostDimPreserver; + template friend struct internal::FullReducer; +#ifdef EIGEN_USE_THREADS + template friend struct internal::FullReducerShard; +#endif +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); +#ifdef EIGEN_HAS_CUDA_FP16 + template friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); + template friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); + template friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); +#endif + template friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + + template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); +#endif + + template friend struct internal::InnerReducer; + + // Returns the Index in the input tensor of the first value that needs to be + // used to compute the reduction at output index "index". + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + if (ReducingInnerMostDims) { + if (static_cast(Layout) == static_cast(ColMajor)) { + return index * m_preservedStrides[0]; + } else { + return index * m_preservedStrides[NumPreservedStrides - 1]; + } + } + // TBD: optimize the case where we preserve the innermost dimensions. + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (PreservingInnerMostDims) { + eigen_assert(m_preservedStrides[0] == 1); + startInput += index; + } else { + startInput += index * m_preservedStrides[0]; + } + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (PreservingInnerMostDims) { + eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1); + startInput += index; + } else { + startInput += index * m_preservedStrides[NumPreservedStrides - 1]; + } + } + return startInput; + } + + // Bitmap indicating if an input dimension is reduced or not. + array m_reduced; + // Dimensions of the output of the operation. + Dimensions m_dimensions; + // Precomputed strides for the output tensor. + array m_outputStrides; + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. + static const int NumPreservedStrides = max_n_1::size; + array m_preservedStrides; + + // Subset of strides of the input tensor for the reduced dimensions. + // Indexed by reduced dimensions. + array m_reducedStrides; + // Size of the input dimensions that are reduced. + // Indexed by reduced dimensions. + array m_reducedDims; + + // Evaluator for the input expression. + TensorEvaluator m_impl; + + // Operation to apply for computing the reduction. + Op m_reducer; + + // For full reductions +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + static const bool RunningOnGPU = internal::is_same::value; + static const bool RunningOnSycl = false; +#elif defined(EIGEN_USE_SYCL) +static const bool RunningOnSycl = internal::is_same::type, Eigen::SyclDevice>::value; +static const bool RunningOnGPU = false; +#else + static const bool RunningOnGPU = false; + static const bool RunningOnSycl = false; +#endif + typename MakePointer_::Type m_result; + + const Device& m_device; + const Dims& m_xpr_dims; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H diff --git a/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h new file mode 100644 index 00000000..90f43793 --- /dev/null +++ b/eigenlib/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -0,0 +1,750 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at the mozilla.org home page + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H + +namespace Eigen { +namespace internal { + + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +// Full reducers for GPU, don't vectorize for now + +// Reducer function that enables multiple cuda thread to safely accumulate at the same +// output address. It basically reads the current value of the output variable, and +// attempts to update it with the new value. If in the meantime another cuda thread +// updated the content of the output address it will try again. +template +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if __CUDA_ARCH__ >= 300 + if (sizeof(T) == 4) + { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else { + assert(0 && "Wordsize not supported"); + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +// We extend atomicExch to support extra data types +template +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} + +#ifdef EIGEN_HAS_CUDA_FP16 +template