// File: ReducedModelOptimization/src/gradientDescent.h
// (710 lines, 21 KiB, C++)

/* gdcpp.h
*
* Author: Fabian Meyer
* Created On: 12 Jul 2019
* License: MIT
*/
#ifndef GDCPP_GDCPP_H_
#define GDCPP_GDCPP_H_
#include <Eigen/Geometry>
#include <cassert>
#include <cmath>
#include <functional>
#include <iomanip>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
namespace gdc {
// Signed index type used throughout this header for sizes and loop counters.
typedef long int Index;
/** Functor to compute forward differences.
* Computes the gradient of the objective f(x) as follows:
*
* grad(x) = (f(x + eps) - f(x)) / eps
*
* The computation requires len(x) evaluations of the objective.
*/
template <typename Scalar> class ForwardDifferences {
public:
typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;
typedef std::function<Scalar(const Vector &)> Objective;
private:
Scalar eps_;
Index threads_;
Objective objective_;
public:
ForwardDifferences()
: ForwardDifferences(std::sqrt(std::numeric_limits<Scalar>::epsilon())) {}
ForwardDifferences(const Scalar eps) : eps_(eps), threads_(1), objective_() {}
void setNumericalEpsilon(const Scalar eps) { eps_ = eps; }
void setThreads(const Index threads) { threads_ = threads; }
void setObjective(const Objective &objective) { objective_ = objective; }
void operator()(const Vector &xval, const Scalar fval, Vector &gradient) {
assert(objective_);
gradient.resize(xval.size());
#pragma omp parallel for num_threads(threads_)
for (Index i = 0; i < xval.size(); ++i) {
Vector xvalN = xval;
xvalN(i) += eps_;
Scalar fvalN = objective_(xvalN);
gradient(i) = (fvalN - fval) / eps_;
}
}
};
/** Functor to compute backward differences.
* Computes the gradient of the objective f(x) as follows:
*
* grad(x) = (f(x) - f(x - eps)) / eps
*
* The computation requires len(x) evaluations of the objective.
*/
template <typename Scalar> class BackwardDifferences {
public:
typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;
typedef std::function<Scalar(const Vector &)> Objective;
private:
Scalar eps_;
Index threads_;
Objective objective_;
public:
BackwardDifferences()
: BackwardDifferences(std::sqrt(std::numeric_limits<Scalar>::epsilon())) {
}
BackwardDifferences(const Scalar eps)
: eps_(eps), threads_(1), objective_() {}
void setNumericalEpsilon(const Scalar eps) { eps_ = eps; }
void setThreads(const Index threads) { threads_ = threads; }
void setObjective(const Objective &objective) { objective_ = objective; }
void operator()(const Vector &xval, const Scalar fval, Vector &gradient) {
assert(objective_);
gradient.resize(xval.size());
#pragma omp parallel for num_threads(threads_)
for (Index i = 0; i < xval.size(); ++i) {
Vector xvalN = xval;
xvalN(i) -= eps_;
Scalar fvalN = objective_(xvalN);
gradient(i) = (fval - fvalN) / eps_;
}
}
};
/** Functor to compute central differences.
* Computes the gradient of the objective f(x) as follows:
*
* grad(x) = (f(x + 0.5 eps) - f(x - 0.5 eps)) / eps
*
* The computation requires 2 * len(x) evaluations of the objective.
*/
// Declared `class` (with explicit access specifiers, as before) for
// consistency with the sibling Forward/BackwardDifferences functors.
template <typename Scalar> class CentralDifferences {
public:
  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;
  typedef std::function<Scalar(const Vector &)> Objective;

private:
  Scalar eps_;          // total numerical step width (eps/2 on each side)
  Index threads_;       // OpenMP thread count for the evaluation loop
  Objective objective_; // objective whose gradient is approximated

public:
  // Default step width: square root of machine epsilon for Scalar.
  CentralDifferences()
      : CentralDifferences(std::sqrt(std::numeric_limits<Scalar>::epsilon())) {}
  CentralDifferences(const Scalar eps) : eps_(eps), threads_(1), objective_() {}

  /** Set the numerical step width used for the difference quotient. */
  void setNumericalEpsilon(const Scalar eps) { eps_ = eps; }

  /** Set the number of OpenMP threads used to evaluate the objective. */
  void setThreads(const Index threads) { threads_ = threads; }

  /** Set the objective function whose gradient is approximated. */
  void setObjective(const Objective &objective) { objective_ = objective; }

  /** Approximate the gradient at xval via
   * (f(x + eps/2) - f(x - eps/2)) / eps.
   * The fval parameter is unused; central differences need 2 * len(x)
   * fresh evaluations of the objective.
   * @param xval point of evaluation
   * @param gradient output gradient; resized to len(xval) */
  void operator()(const Vector &xval, const Scalar, Vector &gradient) {
    assert(objective_);
    // Two evaluations per dimension i: slot 2*i holds f(x + eps/2 * e_i),
    // slot 2*i+1 holds f(x - eps/2 * e_i); the flat loop parallelizes over
    // all 2 * len(x) evaluations.
    Vector fvals(xval.size() * 2);
#pragma omp parallel for num_threads(threads_)
    for (Index i = 0; i < fvals.size(); ++i) {
      Index idx = i / 2;
      Vector xvalN = xval;
      if (i % 2 == 0)
        xvalN(idx) += eps_ / 2;
      else
        xvalN(idx) -= eps_ / 2;
      fvals(i) = objective_(xvalN);
    }
    gradient.resize(xval.size());
    for (Index i = 0; i < xval.size(); ++i)
      gradient(i) = (fvals(i * 2) - fvals(i * 2 + 1)) / eps_;
  }
};
/** Dummy callback functor, which does nothing. */
template <typename Scalar> struct NoCallback {
  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;

  /** No-op iteration callback; never asks the optimizer to stop.
   * @return always true (continue optimizing) */
  bool operator()(const Index, const Vector &, const Scalar,
                  const Vector &) const {
    return true;
  }
};
/** Step size functor, which returns a constant step size. */
template <typename Scalar> class ConstantStepSize {
public:
  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;
  typedef std::function<Scalar(const Vector &, Vector &)> Objective;
  typedef std::function<void(const Vector &, const Scalar, Vector &)>
      FiniteDifferences;

private:
  Scalar stepSize_; // the constant step returned on every call

public:
  // Default step size: 1e-15 (same value as the original literal).
  ConstantStepSize() : ConstantStepSize(1e-15) {}
  ConstantStepSize(const Scalar stepSize) : stepSize_(stepSize) {}

  /** Set the step size returned by this functor.
   * @param stepSize step size returned by functor */
  void setStepSize(const Scalar stepSize) { stepSize_ = stepSize; }

  // Objective and finite differences are not needed for a constant step.
  void setObjective(const Objective &) {}
  void setFiniteDifferences(const FiniteDifferences &) {}

  /** Return the configured constant step size regardless of the iterate. */
  Scalar operator()(const Vector &, const Scalar, const Vector &) {
    return stepSize_;
  }
};
/** Step size functor to compute Barzilai-Borwein (BB) steps.
* The functor can either compute the direct or inverse BB step.
* The steps are computed as follows:
*
* s_k = x_k - x_k-1 k >= 1
* y_k = grad_k - grad_k-1 k >= 1
* Direct: stepSize = (s_k^T * s_k) / (y_k^T * s_k)
* Inverse: stepSize = (y_k^T * s_k) / (y_k^T * y_k)
*
* The very first step is computed as a constant. */
template <typename Scalar> class BarzilaiBorwein {
public:
typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;
typedef std::function<Scalar(const Vector &, Vector &)> Objective;
typedef std::function<void(const Vector &, const Scalar, Vector &)>
FiniteDifferences;
enum class Method { Direct, Inverse };
private:
Vector lastXval_;
Vector lastGradient_;
Method method_;
Scalar constStep_;
Scalar constantStep() const { return constStep_; }
Scalar directStep(const Vector &xval, const Vector &gradient) {
auto sk = xval - lastXval_;
auto yk = gradient - lastGradient_;
Scalar num = sk.dot(sk);
Scalar denom = sk.dot(yk);
if (denom == 0)
return 1;
else
return std::abs(num / denom);
}
Scalar inverseStep(const Vector &xval, const Vector &gradient) {
auto sk = xval - lastXval_;
auto yk = gradient - lastGradient_;
Scalar num = sk.dot(yk);
Scalar denom = yk.dot(yk);
if (denom == 0)
return 1;
else
return std::abs(num / denom);
}
public:
BarzilaiBorwein() : BarzilaiBorwein(Method::Inverse, 1) {}
BarzilaiBorwein(const Method method, const Scalar constStep)
: lastXval_(), lastGradient_(), method_(method), constStep_(constStep) {}
void setObjective(const Objective &) {}
void setFiniteDifferences(const FiniteDifferences &) {}
void setMethod(const Method method) { method_ = method; }
void setConstStepSize(const Scalar stepSize) { constStep_ = stepSize; }
Scalar operator()(const Vector &xval, const Scalar, const Vector &gradient) {
Scalar stepSize = 0;
if (lastXval_.size() == 0) {
stepSize = constStep_;
} else {
switch (method_) {
case Method::Direct:
stepSize = directStep(xval, gradient);
break;
case Method::Inverse:
stepSize = inverseStep(xval, gradient);
break;
default:
assert(false);
break;
}
}
lastGradient_ = gradient;
lastXval_ = xval;
return stepSize;
}
};
/** Step size functor to perform Armijo Linesearch with backtracking.
* The functor iteratively decreases the step size until the following
* conditions are met:
*
* Armijo: f(x - stepSize * grad(x)) <= f(x) - cArmijo * stepSize * grad(x)^T *
* grad(x)
*
* If either condition does not hold the step size is decreased:
*
* stepSize = decrease * stepSize */
template <typename Scalar> class ArmijoBacktracking {
public:
  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;
  typedef std::function<Scalar(const Vector &, Vector &)> Objective;
  typedef std::function<void(const Vector &, const Scalar, Vector &)>
      FiniteDifferences;

protected:
  Scalar decrease_; // backtracking factor, in (0, 1)
  Scalar cArmijo_;  // Armijo relaxation constant, in (0, 0.5)
  Scalar minStep_;  // lower bound on the returned step size
  Scalar maxStep_;  // upper bound on the returned step size
  Index maxIt_;     // 0 or negative: unlimited backtracking iterations
  Objective objective_;
  FiniteDifferences finiteDifferences_;

  /** Evaluate the objective; if it does not fill the gradient itself,
   * approximate the gradient with finite differences. */
  Scalar evaluateObjective(const Vector &xval, Vector &gradient) {
    gradient.resize(0);
    Scalar fval = objective_(xval, gradient);
    if (gradient.size() == 0)
      finiteDifferences_(xval, fval, gradient);
    return fval;
  }

  /** Extension point for derived linesearches (e.g. WolfeBacktracking).
   * Plain Armijo backtracking imposes no second condition. */
  virtual bool computeSecondCondition(const Scalar, const Scalar, const Scalar,
                                      const Vector &, const Vector &) {
    return true;
  }

public:
  ArmijoBacktracking()
      : ArmijoBacktracking(0.8, 1e-4, 1e-20, 1, 0) {} // NOTE: maxStep was 1
  ArmijoBacktracking(const Scalar decrease, const Scalar cArmijo,
                     const Scalar minStep, const Scalar maxStep,
                     const Index iterations)
      : decrease_(decrease), cArmijo_(cArmijo), minStep_(minStep),
        maxStep_(maxStep), maxIt_(iterations), objective_(),
        finiteDifferences_() {
    assert(decrease > 0);
    assert(decrease < 1);
    assert(cArmijo > 0);
    assert(cArmijo < 0.5);
    assert(minStep < maxStep);
  }
  // This class is a polymorphic base (computeSecondCondition is virtual and
  // WolfeBacktracking derives from it); a virtual destructor makes deletion
  // through a base pointer well-defined.
  virtual ~ArmijoBacktracking() {}
  /** Set the decreasing factor for backtracking.
   * Assure that decrease in (0, 1).
   * @param decrease decreasing factor */
  void setBacktrackingDecrease(const Scalar decrease) {
    assert(decrease > 0);
    assert(decrease < 1);
    decrease_ = decrease;
  }
  /** Set the relaxation constant for the Armijo condition (see class
   * description).
   * Assure cArmijo in (0, 0.5).
   * @param cArmijo armijo constant */
  void setArmijoConstant(const Scalar cArmijo) {
    assert(cArmijo > 0);
    assert(cArmijo < 0.5);
    cArmijo_ = cArmijo;
  }
  /** Set the bounds for the step size during linesearch.
   * The final step size is guaranteed to be in [minStep, maxStep].
   * @param minStep minimum step size
   * @param maxStep maximum step size */
  void setStepBounds(const Scalar minStep, const Scalar maxStep) {
    assert(minStep < maxStep);
    minStep_ = minStep;
    maxStep_ = maxStep;
  }
  /** Set the maximum number of iterations.
   * Set to 0 or negative for infinite iterations.
   * @param iterations maximum number of iterations */
  void setMaxIterations(const Index iterations) { maxIt_ = iterations; }
  void setObjective(const Objective &objective) { objective_ = objective; }
  void setFiniteDifferences(const FiniteDifferences &finiteDifferences) {
    finiteDifferences_ = finiteDifferences;
  }
  /** Search backwards from maxStep for a step satisfying the Armijo
   * condition (and any condition a derived class adds).
   * @param xval current iterate
   * @param fval objective value at xval
   * @param gradient gradient at xval
   * @return accepted step size in [minStep, maxStep] */
  Scalar operator()(const Vector &xval, const Scalar fval,
                    const Vector &gradient) {
    assert(objective_);
    assert(finiteDifferences_);
    // Start one decrease above maxStep so the first trial equals maxStep.
    Scalar stepSize = maxStep_ / decrease_;
    Vector gradientN;
    Vector xvalN;
    Scalar fvalN;
    bool armijoCondition = false;
    bool secondCondition = false;
    Index iterations = 0;
    while ((maxIt_ <= 0 || iterations < maxIt_) &&
           stepSize * decrease_ >= minStep_ &&
           !(armijoCondition && secondCondition)) {
      stepSize = decrease_ * stepSize;
      xvalN = xval - stepSize * gradient;
      fvalN = evaluateObjective(xvalN, gradientN);
      armijoCondition =
          fvalN <= fval - cArmijo_ * stepSize * gradient.dot(gradient);
      secondCondition =
          computeSecondCondition(stepSize, fval, fvalN, gradient, gradientN);
      ++iterations;
    }
    return stepSize;
  }
};
/** Step size functor to perform Wolfe Linesearch with backtracking.
* The functor iteratively decreases the step size until the following
* conditions are met:
*
* Armijo: f(x - stepSize * grad(x)) <= f(x) - cArmijo * stepSize * grad(x)^T *
* grad(x) Wolfe: grad(x)^T grad(x - stepSize * grad(x)) <= cWolfe * grad(x)^T *
* grad(x)
*
* If either condition does not hold the step size is decreased:
*
* stepSize = decrease * stepSize */
template <typename Scalar>
class WolfeBacktracking : public ArmijoBacktracking<Scalar> {
public:
  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;
  typedef std::function<Scalar(const Vector &, Vector &)> Objective;
  typedef std::function<void(const Vector &, const Scalar, Vector &)>
      FiniteDifferences;

protected:
  Scalar cWolfe_; // curvature constant, cArmijo < cWolfe < 1

  /** Curvature (Wolfe) condition checked in addition to Armijo:
   * grad(x)^T grad(x - stepSize * grad(x)) <= cWolfe * grad(x)^T grad(x). */
  virtual bool computeSecondCondition(const Scalar, const Scalar, const Scalar,
                                      const Vector &gradient,
                                      const Vector &gradientN) override {
    return gradient.dot(gradientN) <= cWolfe_ * gradient.dot(gradient);
  }

public:
  WolfeBacktracking() : WolfeBacktracking(0.8, 1e-4, 0.9, 1e-20, 1, 0) {}
  WolfeBacktracking(const Scalar decrease, const Scalar cArmijo,
                    const Scalar cWolfe, const Scalar minStep,
                    const Scalar maxStep, const Index iterations)
      : ArmijoBacktracking<Scalar>(decrease, cArmijo, minStep, maxStep,
                                   iterations),
        cWolfe_(cWolfe) {
    assert(cWolfe < 1);
    assert(cArmijo < cWolfe);
  }
  /** Set the Wolfe (curvature) constant (see class description).
   * Assure that cArmijo < cWolfe < 1.
   * @param cWolfe wolfe constant */
  void setWolfeConstant(const Scalar cWolfe) {
    assert(cWolfe < 1);
    cWolfe_ = cWolfe;
  }
};
/** Step size functor which searches for a step that reduces the function
* value.
* The functor iteratively decreases the step size until the following
* condition is met:
*
* f(x - stepSize * grad) < f(x)
*
* If this condition does not hold the step size is decreased:
*
* stepSize = decrease * stepSize
*
* This functor does not require to compute any gradients and does not use
* finite differences. */
template <typename Scalar> class DecreaseBacktracking {
public:
typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;
typedef std::function<Scalar(const Vector &, Vector &)> Objective;
typedef std::function<void(const Vector &, const Scalar, Vector &)>
FiniteDifferences;
private:
Scalar decrease_;
Scalar minStep_;
Scalar maxStep_;
Index maxIt_;
Objective objective_;
public:
DecreaseBacktracking() : DecreaseBacktracking(0.8, 1e-12, 1, 0) {}
DecreaseBacktracking(const Scalar decrease, const Scalar minStep,
const Scalar maxStep, const Index iterations)
: decrease_(decrease), minStep_(minStep), maxStep_(maxStep),
maxIt_(iterations), objective_() {}
/** Set the decreasing factor for backtracking.
* Assure that decrease in (0, 1).
* @param decrease decreasing factor */
void setBacktrackingDecrease(const Scalar decrease) { decrease_ = decrease; }
/** Set the bounds for the step size during linesearch.
* The final step size is guaranteed to be in [minStep, maxStep].
* @param minStep minimum step size
* @param maxStep maximum step size */
void setStepBounds(const Scalar minStep, const Scalar maxStep) {
assert(minStep < maxStep);
minStep_ = minStep;
maxStep_ = maxStep;
}
/** Set the maximum number of iterations.
* Set to 0 or negative for infinite iterations.
* @param iterations maximum number of iterations */
void setMaxIterations(const Index iterations) { maxIt_ = iterations; }
void setObjective(const Objective &objective) { objective_ = objective; }
void setFiniteDifferences(const FiniteDifferences &) {}
Scalar operator()(const Vector &xval, const Scalar fval,
const Vector &gradient) {
assert(objective_);
Scalar stepSize = maxStep_ / decrease_;
Vector xvalN;
Vector gradientN;
Scalar fvalN;
bool improvement = false;
Index iterations = 0;
while ((maxIt_ <= 0 || iterations < maxIt_) &&
stepSize * decrease_ >= minStep_ && !improvement) {
stepSize = decrease_ * stepSize;
xvalN = xval - stepSize * gradient;
fvalN = objective_(xvalN, gradientN);
improvement = fvalN < fval;
++iterations;
}
return stepSize;
}
};
/** Gradient descent optimizer with pluggable step-size policy, per-iteration
 * callback and finite-differences fallback for objectives that do not supply
 * their own gradient. */
template <typename Scalar, typename Objective,
          typename StepSize = BarzilaiBorwein<Scalar>,
          typename Callback = NoCallback<Scalar>,
          typename FiniteDifferences = CentralDifferences<Scalar>>
class GradientDescent {
public:
  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;

  /** Outcome of a minimize() run. */
  struct Result {
    Index iterations; // iterations actually performed
    bool converged;   // true if a gradient/step length criterion was met
    Scalar fval;      // objective value at the final iterate
    Vector xval;      // final iterate
  };

protected:
  Index maxIt_;           // 0 or negative: iterate until convergence
  Scalar minGradientLen_; // stop when ||gradient|| drops below this
  Scalar minStepLen_;     // stop when ||step|| drops below this
  Scalar momentum_;       // momentum weight; 0 disables momentum
  Index verbosity_;       // 0 silent; higher values log more per iteration
  Objective objective_;
  StepSize stepSize_;
  Callback callback_;
  FiniteDifferences finiteDifferences_;

  /** Evaluate the objective; if it does not fill the gradient itself,
   * approximate the gradient with finite differences. */
  Scalar evaluateObjective(const Vector &xval, Vector &gradient) {
    gradient.resize(0);
    Scalar fval = objective_(xval, gradient);
    if (gradient.size() == 0)
      finiteDifferences_(xval, fval, gradient);
    return fval;
  }

  /** Render a vector as "[v0 v1 ...]" with fixed-point formatting
   * (used only for verbose logging). */
  std::string vector2str(const Vector &vec) const {
    std::stringstream ss1;
    ss1 << std::fixed << std::showpoint << std::setprecision(16);
    std::stringstream ss2;
    ss2 << '[';
    for (Index i = 0; i < vec.size(); ++i) {
      ss1 << vec(i);
      ss2 << std::setfill(' ') << std::setw(10) << ss1.str();
      if (i != vec.size() - 1)
        ss2 << ' ';
      ss1.str("");
    }
    ss2 << ']';
    return ss2.str();
  }

public:
  GradientDescent()
      : maxIt_(0), minGradientLen_(static_cast<Scalar>(1e-2)),
        minStepLen_(static_cast<Scalar>(1e-6)), momentum_(0), verbosity_(0),
        objective_(), stepSize_(), callback_(), finiteDifferences_() {}
  ~GradientDescent() {}

  /** Forward the thread count to the finite-differences functor. */
  void setThreads(const Index threads) {
    finiteDifferences_.setThreads(threads);
  }
  /** Forward the numerical epsilon to the finite-differences functor. */
  void setNumericalEpsilon(const Scalar eps) {
    finiteDifferences_.setNumericalEpsilon(eps);
  }
  void setMaxIterations(const Index iterations) { maxIt_ = iterations; }
  void setObjective(const Objective &objective) { objective_ = objective; }
  void setCallback(const Callback &callback) { callback_ = callback; }
  void setMinGradientLength(const Scalar gradientLen) {
    minGradientLen_ = gradientLen;
  }
  void setMinStepLength(const Scalar stepLen) { minStepLen_ = stepLen; }
  // Takes the functor by const reference (previously by const value, which
  // forced a needless copy); call sites are unaffected.
  void setStepSize(const StepSize &stepSize) { stepSize_ = stepSize; }
  void setMomentum(const Scalar momentum) { momentum_ = momentum; }
  void setVerbosity(const Index verbosity) { verbosity_ = verbosity; }

  /** Run gradient descent from initialGuess until an iteration limit,
   * gradient/step length criterion, or callback veto stops it.
   * @param initialGuess starting iterate
   * @return Result with final iterate, value, iterations and converged flag */
  Result minimize(const Vector &initialGuess) {
    // Wire the step-size policy and the finite-differences functor to this
    // object's objective so they all evaluate the same function.
    finiteDifferences_.setObjective([this](const Vector &xval) {
      Vector tmp;
      return this->objective_(xval, tmp);
    });
    stepSize_.setObjective([this](const Vector &xval, Vector &gradient) {
      return this->objective_(xval, gradient);
    });
    stepSize_.setFiniteDifferences(
        [this](const Vector &xval, const Scalar fval, Vector &gradient) {
          this->finiteDifferences_(xval, fval, gradient);
        });
    Vector xval = initialGuess;
    Vector gradient;
    // Initialized so Result.fval is well-defined even if the loop body never
    // runs (previously these were read uninitialized in that case).
    Scalar fval = 0;
    Scalar gradientLen = minGradientLen_ + 1;
    Scalar stepSize = 0;
    Vector step = Vector::Zero(xval.size());
    Scalar stepLen = minStepLen_ + 1;
    bool callbackResult = true;
    Index iterations = 0;
    while ((maxIt_ <= 0 || iterations < maxIt_) &&
           gradientLen >= minGradientLen_ && stepLen >= minStepLen_ &&
           callbackResult) {
      // Apply the step computed in the previous iteration (zero on entry).
      xval -= step;
      fval = evaluateObjective(xval, gradient);
      gradientLen = gradient.norm();
      // update step according to step size and momentum
      stepSize = stepSize_(xval, fval, gradient);
      step = momentum_ * step + (1 - momentum_) * stepSize * gradient;
      stepLen = step.norm();
      // evaluate callback an save its result
      callbackResult = callback_(iterations, xval, fval, gradient);
      if (verbosity_ > 0) {
        std::stringstream ss;
        ss << "it=" << std::setfill('0') << std::setw(4) << iterations
           << std::fixed << std::showpoint << std::setprecision(20)
           << " gradlen=" << gradientLen << " stepsize=" << stepSize
           << " steplen=" << stepLen;
        if (verbosity_ > 2)
          ss << " callback=" << (callbackResult ? "true" : "false");
        ss << " fval=" << fval;
        if (verbosity_ > 1)
          ss << " xval=" << vector2str(xval);
        if (verbosity_ > 2)
          ss << " gradient=" << vector2str(gradient);
        if (verbosity_ > 3)
          ss << " step=" << vector2str(step);
        std::cout << ss.str() << std::endl;
      }
      ++iterations;
    }
    Result result;
    result.xval = xval;
    result.fval = fval;
    result.iterations = iterations;
    result.converged = gradientLen < minGradientLen_ || stepLen < minStepLen_;
    return result;
  }
};
} // namespace gdc
#endif