boost/math/optimization/cma_es.hpp

/*
 * Copyright Nick Thompson, 2024
 * Use, modification and distribution are subject to the
 * Boost Software License, Version 1.0. (See accompanying file
 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 */
#ifndef BOOST_MATH_OPTIMIZATION_CMA_ES_HPP
#define BOOST_MATH_OPTIMIZATION_CMA_ES_HPP
#include <atomic>
#include <cmath>
#include <iostream>
#include <limits>
#include <random>
#include <sstream>
#include <stdexcept>
#include <utility>
#include <vector>
#include <boost/math/optimization/detail/common.hpp>
#include <boost/math/tools/assert.hpp>
#if __has_include(<Eigen/Dense>)
#include <Eigen/Dense>
#else
#error "CMA-ES requires Eigen."
#endif

// Follows the notation in:
// https://arxiv.org/pdf/1604.00772.pdf
// This is a (hopefully) faithful reproduction of the pseudocode in the arxiv review
// by Nikolaus Hansen.
// Comments referring to equations all refer to this arxiv review.
// A slide deck by the same author is given here:
// http://www.cmap.polytechnique.fr/~nikolaus.hansen/CmaTutorialGecco2023-no-audio.pdf
// which is also a very useful reference.

#ifndef BOOST_MATH_DEBUG_CMA_ES
#define BOOST_MATH_DEBUG_CMA_ES 0
#endif

namespace boost::math::optimization {

template <typename ArgumentContainer> struct cma_es_parameters {
  using Real = typename ArgumentContainer::value_type;
  using DimensionlessReal = decltype(Real()/Real());
  ArgumentContainer lower_bounds;
  ArgumentContainer upper_bounds;
  size_t max_generations = 1000;
  ArgumentContainer const *initial_guess = nullptr;
  // In the reference, population size = \lambda.
  // If the population size is zero, it is set to equation (48) of the reference
  // and rounded up to the nearest multiple of threads:
  size_t population_size = 0;
  // In the reference, learning_rate = c_m:
  DimensionlessReal learning_rate = 1;
};

template <typename ArgumentContainer>
void validate_cma_es_parameters(cma_es_parameters<ArgumentContainer> &params) {
  using Real = typename ArgumentContainer::value_type;
  using DimensionlessReal = decltype(Real()/Real());
  using std::isfinite;
  using std::isnan;
  using std::log;
  using std::ceil;
  using std::floor;

  std::ostringstream oss;
  detail::validate_bounds(params.lower_bounds, params.upper_bounds);
  if (params.initial_guess) {
    detail::validate_initial_guess(*params.initial_guess, params.lower_bounds, params.upper_bounds);
  }
  const size_t n = params.upper_bounds.size();
  // Equation 48 of the arxiv review:
  if (params.population_size == 0) {
    //auto tmp = 4.0 + floor(3*log(n));
    // But round to the nearest multiple of the thread count:
    //auto k = static_cast<size_t>(std::ceil(tmp/params.threads));
    //params.population_size = k*params.threads;
    params.population_size = static_cast<size_t>(4 + floor(3*log(n)));
  }
  if (params.learning_rate <= DimensionlessReal(0) || !isfinite(params.learning_rate)) {
    oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
    oss << ": The learning rate must be > 0, but got " << params.learning_rate << ".";
    throw std::invalid_argument(oss.str());
  }
}

template <typename ArgumentContainer, class Func, class URBG>
ArgumentContainer cma_es(
    const Func cost_function,
    cma_es_parameters<ArgumentContainer> &params,
    URBG &gen,
    std::invoke_result_t<Func, ArgumentContainer> target_value = std::numeric_limits<std::invoke_result_t<Func, ArgumentContainer>>::quiet_NaN(),
    std::atomic<bool> *cancellation = nullptr,
    std::atomic<std::invoke_result_t<Func, ArgumentContainer>> *current_minimum_cost = nullptr,
    std::vector<std::pair<ArgumentContainer, std::invoke_result_t<Func, ArgumentContainer>>> *queries = nullptr)
 {
  using Real = typename ArgumentContainer::value_type;
  using DimensionlessReal = decltype(Real()/Real());
  using ResultType = std::invoke_result_t<Func, ArgumentContainer>;
  using std::abs;
  using std::log;
  using std::exp;
  using std::pow;
  using std::min;
  using std::max;
  using std::sqrt;
  using std::isnan;
  using std::isfinite;
  using std::uniform_real_distribution;
  using std::normal_distribution;
  validate_cma_es_parameters(params);
  // n = dimension of problem:
  const size_t n = params.lower_bounds.size();
  std::atomic<bool> target_attained = false;
  std::atomic<ResultType> lowest_cost = std::numeric_limits<ResultType>::infinity();
  ArgumentContainer best_vector;
  // p_{c} := evolution path, equation (24) of the arxiv review:
  Eigen::Vector<DimensionlessReal, Eigen::Dynamic> p_c(n);
  // p_{\sigma} := conjugate evolution path, equation (31) of the arxiv review:
  Eigen::Vector<DimensionlessReal, Eigen::Dynamic> p_sigma(n);
  if constexpr (detail::has_resize_v<ArgumentContainer>) {
    best_vector.resize(n, std::numeric_limits<Real>::quiet_NaN());
  }
  for (size_t i = 0; i < n; ++i) {
    p_c[i] = DimensionlessReal(0);
    p_sigma[i] = DimensionlessReal(0);
  }
  // Table 1, \mu = floor(\lambda/2):
  size_t mu = params.population_size/2;
  std::vector<DimensionlessReal> w_prime(params.population_size, std::numeric_limits<DimensionlessReal>::quiet_NaN());
  for (size_t i = 0; i < params.population_size; ++i) {
    // Equation (49), but 0-indexed:
    w_prime[i] = log(static_cast<DimensionlessReal>(params.population_size + 1)/(2*(i+1)));
  }
  // Table 1, notes at top:
  DimensionlessReal positive_weight_sum = 0;
  DimensionlessReal sq_weight_sum = 0;
  for (size_t i = 0; i < mu; ++i) {
    BOOST_MATH_ASSERT(w_prime[i] > 0);
    positive_weight_sum += w_prime[i];
    sq_weight_sum += w_prime[i]*w_prime[i];
  }
  DimensionlessReal mu_eff = positive_weight_sum*positive_weight_sum/sq_weight_sum;
  BOOST_MATH_ASSERT(1 <= mu_eff);
  BOOST_MATH_ASSERT(mu_eff <= mu);
  DimensionlessReal negative_weight_sum = 0;
  sq_weight_sum = 0;
  for (size_t i = mu; i < params.population_size; ++i) {
    BOOST_MATH_ASSERT(w_prime[i] <= 0);
    negative_weight_sum += w_prime[i];
    sq_weight_sum += w_prime[i]*w_prime[i];
  }
  DimensionlessReal mu_eff_m = negative_weight_sum*negative_weight_sum/sq_weight_sum;
  // Equation (54):
  DimensionlessReal c_m = params.learning_rate;
  // Equation (55):
  DimensionlessReal c_sigma = (mu_eff + 2)/(n + mu_eff + 5);
  BOOST_MATH_ASSERT(c_sigma < 1);
  DimensionlessReal d_sigma = 1 + 2*(max)(DimensionlessReal(0), sqrt(DimensionlessReal((mu_eff - 1)/(n + 1))) - DimensionlessReal(1)) + c_sigma;
  // Equation (56):
  DimensionlessReal c_c = (4 + mu_eff/n)/(n + 4 + 2*mu_eff/n);
  BOOST_MATH_ASSERT(c_c <= 1);
  // Equation (57):
  DimensionlessReal c_1 = DimensionlessReal(2)/(pow(n + 1.3, 2) + mu_eff);
  // Equation (58)
  DimensionlessReal c_mu = (min)(1 - c_1, 2*(DimensionlessReal(0.25)  + mu_eff  + 1/mu_eff - 2)/((n+2)*(n+2) + mu_eff));
  BOOST_MATH_ASSERT(c_1 + c_mu <= DimensionlessReal(1));
  // Equation (50):
  DimensionlessReal alpha_mu_m = 1 + c_1/c_mu;
  // Equation (51):
  DimensionlessReal alpha_mu_eff_m = 1 + 2*mu_eff_m/(mu_eff + 2);
  // Equation (52):
  DimensionlessReal alpha_m_pos_def = (1- c_1 - c_mu)/(n*c_mu);
  // Equation (53):
  std::vector<DimensionlessReal> weights(params.population_size, std::numeric_limits<DimensionlessReal>::quiet_NaN());
  for (size_t i = 0; i < mu; ++i) {
    weights[i] = w_prime[i]/positive_weight_sum;
  }
  DimensionlessReal min_alpha = (min)(alpha_mu_m, (min)(alpha_mu_eff_m, alpha_m_pos_def));
  for (size_t i = mu; i < params.population_size; ++i) {
    weights[i] = min_alpha*w_prime[i]/abs(negative_weight_sum);
  }
  // mu:= number of parents, lambda := number of offspring.
  Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic> C = Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic>::Identity(n, n);
  ArgumentContainer mean_vector;
  // See the footnote in Figure 6 of the arxiv review:
  // We should consider the more robust initialization described there. . . 
  Real sigma = DimensionlessReal(0.3)*(params.upper_bounds[0] - params.lower_bounds[0]);;
  if (params.initial_guess) {
    mean_vector = *params.initial_guess;
  }
  else {
    mean_vector = detail::random_initial_population(params.lower_bounds, params.upper_bounds, 1, gen)[0];
  }
  auto initial_cost = cost_function(mean_vector);
  if (!isnan(initial_cost)) {
    best_vector = mean_vector;
    lowest_cost = initial_cost;
    if (current_minimum_cost) {
      *current_minimum_cost = initial_cost;
    }
  }
#if BOOST_MATH_DEBUG_CMA_ES
  {
    std::cout << __FILE__ << ":" << __LINE__ << ":" << __func__ << "\n";
    std::cout << "\tRunning a (" << params.population_size/2 << "/" << params.population_size/2 << "_W, " << params.population_size << ")-aCMA Evolutionary Strategy on " << params.threads << " threads.\n";
    std::cout << "\tInitial mean vector: {";
    for (size_t i = 0; i < n - 1; ++i) {
      std::cout << mean_vector[i] << ", ";
    }
    std::cout << mean_vector[n - 1] << "}.\n";
    std::cout << "\tCost: " << lowest_cost << ".\n";
    std::cout << "\tInitial step length: " << sigma << ".\n";
    std::cout << "\tVariance effective selection mass: " << mu_eff << ".\n";
    std::cout << "\tLearning rate for rank-one update of covariance matrix: " << c_1 << ".\n";
    std::cout << "\tLearning rate for rank-mu update of covariance matrix: " << c_mu << ".\n";
    std::cout << "\tDecay rate for cumulation path for step-size control: " << c_sigma << ".\n";
    std::cout << "\tLearning rate for the mean: " << c_m << ".\n";
    std::cout << "\tDamping parameter for step-size update: " << d_sigma << ".\n";
  }
#endif
  size_t generation = 0;

  std::vector<Eigen::Vector<DimensionlessReal, Eigen::Dynamic>> ys(params.population_size);
  std::vector<ArgumentContainer> xs(params.population_size);
  std::vector<ResultType> costs(params.population_size, std::numeric_limits<ResultType>::quiet_NaN());
  Eigen::Vector<DimensionlessReal, Eigen::Dynamic> weighted_avg_y(n);
  Eigen::Vector<DimensionlessReal, Eigen::Dynamic> z(n);
  if constexpr (detail::has_resize_v<ArgumentContainer>) {
    for (auto & x : xs) {
      x.resize(n, std::numeric_limits<Real>::quiet_NaN());
    }
  }
  for (auto & y : ys) {
    y.resize(n);
  }
  normal_distribution<DimensionlessReal> dis(DimensionlessReal(0), DimensionlessReal(1));
  do {
    if (cancellation && *cancellation) {
      break;
    }
    // TODO: The reference contends the following in
    // Section B.2 "Strategy internal numerical effort":
    // "In practice, the re-calculation of B and D needs to be done not until about
    // max(1, floor(1/(10n(c_1+c_mu)))) generations."
    // Note that sigma can be dimensionless, in which case C carries the units.
    // This is a weird decision-we're not gonna do that!
    Eigen::SelfAdjointEigenSolver<Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic>> eigensolver(C);
    if (eigensolver.info() != Eigen::Success) {
      std::ostringstream oss;
      oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
      oss << ": Could not decompose the covariance matrix as BDB^{T}.";
      throw std::logic_error(oss.str());
    }
    Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic> B = eigensolver.eigenvectors();
    // Eigen returns D^2, in the notation of the survey:
    auto D = eigensolver.eigenvalues();
    // So make it better:
    for (auto & d : D) {
      if (d <= 0 || isnan(d)) {
        std::ostringstream oss;
        oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
        oss << ": The covariance matrix is not positive definite. This breaks the evolution path computation downstream.\n";
        oss << "C=\n" << C << "\n";
        oss << "Eigenvalues: " << D;
        throw std::domain_error(oss.str());
      }
      d = sqrt(d);
    }

    for (size_t k = 0; k < params.population_size; ++k) {
      auto & y = ys[k];
      auto & x = xs[k];
      BOOST_MATH_ASSERT(static_cast<size_t>(x.size()) == n);
      BOOST_MATH_ASSERT(static_cast<size_t>(y.size()) == n);
      size_t resample_counter = 0;
      do {
        // equation (39) of Figure 6:
        // Also see equation (4):
        for (size_t i = 0; i < n; ++i) {
          z[i] = dis(gen);
        }
        Eigen::Vector<DimensionlessReal, Eigen::Dynamic> Dz(n);
        for (size_t i = 0; i < n; ++i) {
          Dz[i] = D[i]*z[i];
        }
        y = B*Dz;
        for (size_t i = 0; i < n; ++i) {
          BOOST_MATH_ASSERT(!isnan(mean_vector[i]));
          BOOST_MATH_ASSERT(!isnan(y[i]));
          x[i] = mean_vector[i] + sigma*y[i]; // equation (40) of Figure 6.
        }
        costs[k] = cost_function(x);
        if (resample_counter++ == 50) {
          std::ostringstream oss;
          oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
          oss << ": 50 resamples was not sufficient to find an argument to the cost function which did not return NaN.";
          oss << " Giving up.";
          throw std::domain_error(oss.str());
        }
      } while (isnan(costs[k]));

      if (queries) {
        queries->emplace_back(std::make_pair(x, costs[k]));
      }
      if (costs[k] < lowest_cost) {
        lowest_cost = costs[k];
        best_vector = x;
        if (current_minimum_cost && costs[k] < *current_minimum_cost) {
          *current_minimum_cost = costs[k];
        }
        if (lowest_cost < target_value) {
          target_attained = true;
          break;
        }
      }
    }
    if (target_attained) {
      break;
    }
    if (cancellation && *cancellation) {
      break;
    }
    auto indices = detail::best_indices(costs);
    // Equation (41), Figure 6:
    for (size_t j = 0; j < n; ++j) {
      weighted_avg_y[j] = 0;
      for (size_t i = 0; i < mu; ++i) {
        BOOST_MATH_ASSERT(!isnan(weights[i]));
        BOOST_MATH_ASSERT(!isnan(ys[indices[i]][j]));
        weighted_avg_y[j] += weights[i]*ys[indices[i]][j];
      }
    }
    // Equation (42), Figure 6:
    for (size_t j = 0; j < n; ++j) {
      mean_vector[j] = mean_vector[j] + c_m*sigma*weighted_avg_y[j];
    }
    // Equation (43), Figure 6: Start with C^{-1/2}<y>_{w}
    Eigen::Vector<DimensionlessReal, Eigen::Dynamic> inv_D_B_transpose_y = B.transpose()*weighted_avg_y;
    for (long j = 0; j < inv_D_B_transpose_y.size(); ++j) {
      inv_D_B_transpose_y[j] /= D[j];
    }
    Eigen::Vector<DimensionlessReal, Eigen::Dynamic> C_inv_sqrt_y_avg = B*inv_D_B_transpose_y;
    // Equation (43), Figure 6:
    DimensionlessReal p_sigma_norm = 0;
    for (size_t j = 0; j < n; ++j) {
      p_sigma[j] = (1-c_sigma)*p_sigma[j] + sqrt(c_sigma*(2-c_sigma)*mu_eff)*C_inv_sqrt_y_avg[j];
      p_sigma_norm += p_sigma[j]*p_sigma[j];
    }
    p_sigma_norm = sqrt(p_sigma_norm);
    // A: Algorithm Summary: E[||N(0,1)||]:
    const DimensionlessReal expectation_norm_0I = sqrt(static_cast<DimensionlessReal>(n))*(DimensionlessReal(1) - DimensionlessReal(1)/(4*n) + DimensionlessReal(1)/(21*n*n));
    // Equation (44), Figure 6:
    sigma = sigma*exp(c_sigma*(p_sigma_norm/expectation_norm_0I -1)/d_sigma);
    // A: Algorithm Summary:
    DimensionlessReal h_sigma = 0;
    DimensionlessReal rhs = (DimensionlessReal(1.4) + DimensionlessReal(2)/(n+1))*expectation_norm_0I*sqrt(1 - pow(1-c_sigma, 2*(generation+1)));
    if (p_sigma_norm < rhs) {
      h_sigma = 1;
    }
    // Equation (45), Figure 6:
    p_c = (1-c_c)*p_c + h_sigma*sqrt(c_c*(2-c_c)*mu_eff)*weighted_avg_y;
    DimensionlessReal delta_h_sigma = (1-h_sigma)*c_c*(2-c_c);
    DimensionlessReal weight_sum = 0;
    for (auto & w : weights) {
      weight_sum += w;
    }
    // Equation (47), Figure 6:
    DimensionlessReal K = (1 + c_1*delta_h_sigma - c_1 - c_mu*weight_sum);
    // Can these operations be sped up using `.selfadjointView<Eigen::Upper>`?
    // Maybe: A.selfadjointView<Eigen::Lower>().rankUpdate(p_c, c_1);?
    C = K*C + c_1*p_c*p_c.transpose();
    // Incorporate positive weights of Equation (46):
    for (size_t i = 0; i < params.population_size/2; ++i) {
      C += c_mu*weights[i]*ys[indices[i]]*ys[indices[i]].transpose();
    }
    for (size_t i = params.population_size/2; i < params.population_size; ++i) {
      Eigen::Vector<DimensionlessReal, Eigen::Dynamic> D_inv_BTy = B.transpose()*ys[indices[i]];
      for (size_t j = 0; j < n; ++j) {
        D_inv_BTy[j] /= D[j];
      }
      DimensionlessReal squared_norm = D_inv_BTy.squaredNorm();
      DimensionlessReal K2 = c_mu*weights[i]/squared_norm;
      C += K2*ys[indices[i]]*ys[indices[i]].transpose();
    }
  } while (generation++ < params.max_generations);

  return best_vector;
}

} // namespace boost::math::optimization
#endif
Boost C++ Libraries

boost/math/optimization/cma_es.hpp