From 6f36107f82f76788ba2cd3fb666456275545bc86 Mon Sep 17 00:00:00 2001 From: tnagler Date: Thu, 14 Nov 2019 14:24:59 +0100 Subject: [PATCH 1/7] isolate KdeFFT class --- .gitignore | 1 + inst/include/kde1d/dpik.hpp | 168 ++++++++++------------------------ inst/include/kde1d/kdefft.hpp | 122 ++++++++++++++++++++++++ 3 files changed, 171 insertions(+), 120 deletions(-) create mode 100644 inst/include/kde1d/kdefft.hpp diff --git a/.gitignore b/.gitignore index d252a03..a4a57ef 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ .Ruserdata revdep /revdep/.cache.rds +.vscode diff --git a/inst/include/kde1d/dpik.hpp b/inst/include/kde1d/dpik.hpp index cd8156f..7319074 100644 --- a/inst/include/kde1d/dpik.hpp +++ b/inst/include/kde1d/dpik.hpp @@ -1,7 +1,7 @@ #pragma once +#include "kdefft.hpp" #include "stats.hpp" -#include #define _USE_MATH_DEFINES #include @@ -13,132 +13,67 @@ namespace bw { //! Methodology is similar to Sheather and Jones(1991), but asymptotic //! bias/variance expressions are adapted for higher-order polynomials and //! nearest neighbor bandwidths. -class PluginBandwidthSelector { +class PluginBandwidthSelector +{ public: PluginBandwidthSelector(const Eigen::VectorXd& x, const Eigen::VectorXd& weights = Eigen::VectorXd()); double select_bw(size_t deg); private: - Eigen::VectorXd linbin(const Eigen::VectorXd& x, - const Eigen::VectorXd& weights); double scale_est(const Eigen::VectorXd& x); - Eigen::VectorXd kde_drv(size_t drv); - void set_bw_for_bkfe(size_t drv); - double bkfe(size_t drv); + double get_bw_for_bkfe(size_t drv); double ll_ibias2(size_t deg); double ll_ivar(size_t deg); - size_t num_bins_{ 401 }; - double bw_ { NAN }; - double lower_; - double upper_; + fft::KdeFFT kde_; Eigen::VectorXd weights_; Eigen::VectorXd bin_counts_; double scale_; }; - //! @param x vector of observations. //! @param weigths optional vector of weights for each observation. -PluginBandwidthSelector::PluginBandwidthSelector(const Eigen::VectorXd& x, - const Eigen::VectorXd& weights) - : lower_(x.minCoeff()) - , upper_(x.maxCoeff()) - , weights_(weights) +inline PluginBandwidthSelector::PluginBandwidthSelector( + const Eigen::VectorXd& x, + const Eigen::VectorXd& weights) + : weights_(weights) + , kde_(fft::KdeFFT(x, 0.0, weights)) { - if (weights.size() > 0 && (weights.size() != x.size())) - throw std::runtime_error("x and weights must have the same size"); if (weights.size() == 0) { weights_ = Eigen::VectorXd::Ones(x.size()); } else { weights_ = weights_ * x.size() / weights_.sum(); } - bin_counts_ = linbin(x, weights_); + bin_counts_ = kde_.get_bin_counts(); scale_ = scale_est(x); } -//! Computes bin counts for univariate data via the linear binning strategy. -//! @param x vector of observations -//! @param weights vector of weights for each observation. -inline Eigen::VectorXd PluginBandwidthSelector::linbin( - const Eigen::VectorXd& x, const Eigen::VectorXd& weights) -{ - Eigen::VectorXd gcnts = Eigen::VectorXd::Zero(num_bins_); - double rem, lxi, delta; - - delta = (upper_ - lower_) / (num_bins_ - 1.0); - for (size_t i = 0; i < x.size(); ++i) { - lxi = (x(i) - lower_) / delta + 1.0; - size_t li = static_cast(lxi); - rem = lxi - li; - if (li >= 1 && li < num_bins_) { - gcnts(li - 1) += (1 - rem) * weights(i); - gcnts(li) += rem * weights(i); - } - } - - return gcnts; -} - //! Scale estimate (minimum of standard deviation and robust equivalent) //! @param x vector of observations. -double PluginBandwidthSelector::scale_est(const Eigen::VectorXd& x) +inline double +PluginBandwidthSelector::scale_est(const Eigen::VectorXd& x) { double m_x = x.cwiseProduct(weights_).mean(); Eigen::VectorXd sx = (x - Eigen::VectorXd::Constant(x.size(), m_x)); - double sd_x = std::sqrt( - sx.cwiseAbs2().cwiseProduct(weights_).sum()/(x.size() - 1)); + double sd_x = + std::sqrt(sx.cwiseAbs2().cwiseProduct(weights_).sum() / (x.size() - 1)); Eigen::VectorXd q_x(2); q_x << 0.25, 0.75; q_x = stats::quantile(x, q_x, weights_); - double scale = std::min((q_x(1) - q_x(0))/1.349, sd_x); + double scale = std::min((q_x(1) - q_x(0)) / 1.349, sd_x); if (scale == 0) { scale = (sd_x > 0) ? sd_x : 1.0; } return scale; } - -//! Binned kernel density derivative estimate -//! @param drv order of derivative. -//! @return estimated derivative evaluated at the bin centers. -Eigen::VectorXd PluginBandwidthSelector::kde_drv(size_t drv) -{ - double delta = (upper_ - lower_) / (num_bins_ - 1.0); - double tau = 4.0 + drv; - size_t L = std::floor(tau * bw_ / delta); - L = std::min(L, num_bins_); - - double tmp_dbl = L * delta / bw_; - Eigen::VectorXd arg = Eigen::VectorXd::LinSpaced(L + 1, 0.0, tmp_dbl); - tmp_dbl = std::pow(bw_, drv + 1.0); - arg = stats::dnorm_drv(arg, drv) / (tmp_dbl * bin_counts_.sum()); - - tmp_dbl = num_bins_ + L + 1.0; - tmp_dbl = std::pow(2, std::ceil(std::log(tmp_dbl) / std::log(2))); - size_t P = static_cast(tmp_dbl); - - Eigen::VectorXd arg2 = Eigen::VectorXd::Zero(P); - arg2.head(L + 1) = arg; - arg2.tail(L) = arg.tail(L).reverse() * (drv % 2 ? -1.0 : 1.0); - - Eigen::VectorXd x2 = Eigen::VectorXd::Zero(P); - x2.head(num_bins_) = bin_counts_; - - Eigen::FFT fft; - Eigen::VectorXcd tmp1 = fft.fwd(arg2); - Eigen::VectorXcd tmp2 = fft.fwd(x2); - tmp1 = tmp1.cwiseProduct(tmp2); - tmp2 = fft.inv(tmp1); - return tmp2.head(num_bins_).real(); -} - //! optimal bandwidths for kernel functionals (see Wand and Jones' book, 3.5) //! only works for even drv //! @param drv order of the derivative in the kernel functional. -void PluginBandwidthSelector::set_bw_for_bkfe(size_t drv) +inline double +PluginBandwidthSelector::get_bw_for_bkfe(size_t drv) { if (drv % 2 != 0) { throw std::runtime_error("only even drv allowed."); @@ -149,57 +84,49 @@ void PluginBandwidthSelector::set_bw_for_bkfe(size_t drv) // start with normal reference rule (eq 3.7) int r = drv + 4; - double psi =((r/2) % 2 == 0) ? 1 : -1; + double psi = ((r / 2) % 2 == 0) ? 1 : -1; psi *= std::tgamma(r + 1); - psi /= std::pow(2 * scale_, r + 1) * std::tgamma(r/2 + 1) * std::sqrt(M_PI); + psi /= std::pow(2 * scale_, r + 1) * std::tgamma(r / 2 + 1) * std::sqrt(M_PI); double Kr = stats::dnorm_drv(Eigen::VectorXd::Zero(1), r - 2)(0); - bw_ = std::pow(-2 * Kr / (psi * n), 1.0 / (r + 1)); + kde_.set_bw(std::pow(-2 * Kr / (psi * n), 1.0 / (r + 1))); // now use plug in to select the actual bandwidth (eq. 3.6) r -= 2; - psi = bkfe(drv + 2); - Kr = stats::dnorm_drv(Eigen::VectorXd::Zero(1), r - 2)(0); - - bw_ = std::pow(-2 * Kr / (psi * n), 1.0 / (r + 1)); -} + // that's bkfe() + psi = bin_counts_.cwiseProduct(kde_.kde_drv(drv + 2)).sum(); + psi /= bin_counts_.sum(); + Kr = stats::dnorm_drv(Eigen::VectorXd::Zero(1), r - 2)(0); -//! Binned Kernel Functional Estimate -//! @param x vector of bin counts -//! @param drv order of derivative in the density functional -//! @param h kernel bandwidth -//! @param a minimum value of x at which to compute the estimate -//! @param b maximum value of x at which to compute the estimate -//! @return the estimated functional -double PluginBandwidthSelector::bkfe(size_t drv) -{ - return bin_counts_.cwiseProduct(kde_drv(drv)).sum() / bin_counts_.sum(); + return std::pow(-2 * Kr / (psi * n), 1.0 / (r + 1)); } //! computes the integrated squared bias (without bw and n terms). //! Bias expressions can be found in Geenens (JASA, 2014) //! @param deg degree of the local polynomial. -double PluginBandwidthSelector::ll_ibias2(size_t deg) +inline double +PluginBandwidthSelector::ll_ibias2(size_t deg) { Eigen::VectorXd arg; if (deg == 0) { - set_bw_for_bkfe(4); - arg = 0.25 * kde_drv(4); + kde_.set_bw(get_bw_for_bkfe(4)); + arg = 0.25 * kde_.kde_drv(4); } else if (deg == 1) { - set_bw_for_bkfe(4); - Eigen::VectorXd f0 = kde_drv(0); - Eigen::VectorXd f1 = kde_drv(1); - Eigen::VectorXd f2 = kde_drv(2); + kde_.set_bw(get_bw_for_bkfe(4)); + Eigen::VectorXd f0 = kde_.kde_drv(0); + Eigen::VectorXd f1 = kde_.kde_drv(1); + Eigen::VectorXd f2 = kde_.kde_drv(2); arg = (0.5 * f2 + f1.cwiseAbs2().cwiseQuotient(f0)) - .cwiseAbs2().cwiseQuotient(f0); + .cwiseAbs2() + .cwiseQuotient(f0); } else if (deg == 2) { - set_bw_for_bkfe(8); - Eigen::VectorXd f0 = kde_drv(0); - Eigen::VectorXd f1 = kde_drv(1); - Eigen::VectorXd f2 = kde_drv(2); - Eigen::VectorXd f4 = kde_drv(4); + kde_.set_bw(get_bw_for_bkfe(8)); + Eigen::VectorXd f0 = kde_.kde_drv(0); + Eigen::VectorXd f1 = kde_.kde_drv(1); + Eigen::VectorXd f2 = kde_.kde_drv(2); + Eigen::VectorXd f4 = kde_.kde_drv(4); arg = f4 - 3 * f2.cwiseAbs2().cwiseQuotient(f0) + - 2 * (f1.array().pow(4) / f0.array().pow(3)).matrix(); + 2 * (f1.array().pow(4) / f0.array().pow(3)).matrix(); arg = (0.125 * arg).cwiseAbs2().cwiseQuotient(f0); } else { throw std::runtime_error("deg must be one of {0, 1, 2}."); @@ -209,8 +136,9 @@ double PluginBandwidthSelector::ll_ibias2(size_t deg) //! computes the integrated squared variance (without bw and n terms). //! Variance expressions can be found in Geenens (JASA, 2014) -//! @param deg degree of the local polynomial. -double PluginBandwidthSelector::ll_ivar(size_t deg) + //! @param deg degree of the local polynomial. +inline double +PluginBandwidthSelector::ll_ivar(size_t deg) { if (deg > 2) throw std::runtime_error("deg must be one of {0, 1, 2}."); @@ -219,7 +147,8 @@ double PluginBandwidthSelector::ll_ivar(size_t deg) //! Selects the bandwidth for kernel density estimation. //! @param deg degree of the local polynomial. -inline double PluginBandwidthSelector::select_bw(size_t deg) +inline double +PluginBandwidthSelector::select_bw(size_t deg) { // effective sample size double n = std::pow(weights_.sum(), 2) / weights_.cwiseAbs2().sum(); @@ -227,7 +156,7 @@ inline double PluginBandwidthSelector::select_bw(size_t deg) int bwpow = (deg < 2 ? 4 : 8); try { double ibias2 = ll_ibias2(deg); - double ivar = ll_ivar(deg); + double ivar = ll_ivar(deg); bw = std::pow(ivar / (bwpow * n * ibias2), 1.0 / (bwpow + 1)); } catch (...) { bw = 4.0 * 1.06 * scale_ * std::pow(n, -1.0 / (bwpow + 1)); @@ -239,4 +168,3 @@ inline double PluginBandwidthSelector::select_bw(size_t deg) } // end kde1d::bw } // end kde1d - diff --git a/inst/include/kde1d/kdefft.hpp b/inst/include/kde1d/kdefft.hpp new file mode 100644 index 0000000..c833750 --- /dev/null +++ b/inst/include/kde1d/kdefft.hpp @@ -0,0 +1,122 @@ +#pragma once + +#include "stats.hpp" +#include + +namespace kde1d { + +namespace fft { + +//! Bandwidth selection for local-likelihood density estimation. +//! Methodology is similar to Sheather and Jones(1991), but asymptotic +//! bias/variance expressions are adapted for higher-order polynomials and +//! nearest neighbor bandwidths. +class KdeFFT +{ +public: + KdeFFT(const Eigen::VectorXd& x, + double bw, + const Eigen::VectorXd& weights = Eigen::VectorXd()); + + Eigen::VectorXd kde_drv(size_t drv) const; + Eigen::VectorXd get_bin_counts() const { return bin_counts_; }; + void set_bw(double bw) { bw_ = bw; }; + +private: + Eigen::VectorXd linbin(const Eigen::VectorXd& x, + double lower, + double upper, + const Eigen::VectorXd& weights) const; + + double bw_; + Eigen::VectorXd bin_counts_; + double lower_; + double upper_; + size_t num_bins_{ 400 }; +}; + +//! @param x vector of observations. +//! @param weigths optional vector of weights for each observation. +inline KdeFFT::KdeFFT(const Eigen::VectorXd& x, + double bw, + const Eigen::VectorXd& weights) +{ + if (weights.size() > 0 && (weights.size() != x.size())) + throw std::runtime_error("x and weights must have the same size"); + + Eigen::VectorXd w; + if (weights.size() > 0) { + w = weights / weights.mean(); + } else { + w = Eigen::VectorXd::Ones(x.size()); + } + + lower_ = x.minCoeff() - 3 * bw; + upper_ = x.maxCoeff() + 3 * bw; + bin_counts_ = linbin(x, lower_, upper_, w); +} + +//! Computes bin counts for univariate data via the linear binning strategy. +//! @param x vector of observations +//! @param weights vector of weights for each observation. +inline Eigen::VectorXd +KdeFFT::linbin(const Eigen::VectorXd& x, + double lower, + double upper, + const Eigen::VectorXd& weights) const +{ + Eigen::VectorXd gcnts = Eigen::VectorXd::Zero(num_bins_ + 1); + double rem, lxi, delta; + + delta = (upper_ - lower_) / num_bins_; + for (size_t i = 0; i < x.size(); ++i) { + lxi = (x(i) - lower_) / delta; + size_t li = static_cast(lxi); + rem = lxi - li; + if (li < num_bins_) { + gcnts(li) += (1 - rem) * weights(i); + gcnts(li + 1) += rem * weights(i); + } + } + + return gcnts; +} + +//! Binned kernel density derivative estimate +//! @param drv order of derivative. +//! @return estimated derivative evaluated at the bin centers. +inline Eigen::VectorXd +KdeFFT::kde_drv(size_t drv) const +{ + double delta = (upper_ - lower_) / num_bins_; + double tau = 4.0 + drv; + size_t L = std::floor(tau * bw_ / delta); + L = std::min(L, num_bins_); + + double tmp_dbl = L * delta / bw_; + Eigen::VectorXd arg = Eigen::VectorXd::LinSpaced(L + 1, 0.0, tmp_dbl); + tmp_dbl = std::pow(bw_, drv + 1.0); + arg = stats::dnorm_drv(arg, drv) / (tmp_dbl * bin_counts_.sum()); + + tmp_dbl = num_bins_ + L + 1.0; + tmp_dbl = std::pow(2, std::ceil(std::log(tmp_dbl) / std::log(2))); + size_t P = static_cast(tmp_dbl); + + Eigen::VectorXd arg2 = Eigen::VectorXd::Zero(P); + arg2.head(L + 1) = arg; + arg2.tail(L) = arg.tail(L).reverse() * (drv % 2 ? -1.0 : 1.0); + + Eigen::VectorXd x2 = Eigen::VectorXd::Zero(P); + x2.head(num_bins_) = bin_counts_; + + Eigen::FFT fft; + Eigen::VectorXcd tmp1 = fft.fwd(arg2); + Eigen::VectorXcd tmp2 = fft.fwd(x2); + tmp1 = tmp1.cwiseProduct(tmp2); + tmp2 = fft.inv(tmp1); + return tmp2.head(num_bins_).real(); +} + +} // end kde1d::bw + +} // end kde1d From 5b495a1b39e19ba577c47937d570591f3b5fad5e Mon Sep 17 00:00:00 2001 From: tnagler Date: Thu, 14 Nov 2019 16:44:16 +0100 Subject: [PATCH 2/7] almost done with fft --- inst/include/kde1d/kde1d.hpp | 300 +++++++++++++++------------------- inst/include/kde1d/kdefft.hpp | 13 +- inst/include/kde1d/stats.hpp | 66 ++++---- 3 files changed, 173 insertions(+), 206 deletions(-) diff --git a/inst/include/kde1d/kde1d.hpp b/inst/include/kde1d/kde1d.hpp index 72233a2..70c4525 100644 --- a/inst/include/kde1d/kde1d.hpp +++ b/inst/include/kde1d/kde1d.hpp @@ -1,16 +1,17 @@ #pragma once +#include "kde1d/dpik.hpp" #include "kde1d/interpolation.hpp" #include "kde1d/stats.hpp" #include "kde1d/tools.hpp" -#include "kde1d/dpik.hpp" -#include #include +#include namespace kde1d { //! Local-polynomial density estimation in 1-d. -class Kde1d { +class Kde1d +{ public: // constructors Kde1d() {} @@ -31,19 +32,18 @@ class Kde1d { Eigen::VectorXd pdf(const Eigen::VectorXd& x) const; Eigen::VectorXd cdf(const Eigen::VectorXd& x) const; Eigen::VectorXd quantile(const Eigen::VectorXd& x) const; - Eigen::VectorXd simulate(size_t n, - const std::vector& seeds = {}) const; + Eigen::VectorXd simulate(size_t n, const std::vector& seeds = {}) const; // getters Eigen::VectorXd get_values() const { return grid_.get_values(); } Eigen::VectorXd get_grid_points() const { return grid_.get_grid_points(); } size_t get_nlevels() const { return nlevels_; } - double get_bw() const {return bw_;} - double get_deg() const {return deg_;} - double get_xmin() const {return xmin_;} - double get_xmax() const {return xmax_;} - double get_edf() const {return edf_;} - double get_loglik() const {return loglik_;} + double get_bw() const { return bw_; } + double get_deg() const { return deg_; } + double get_xmin() const { return xmin_; } + double get_xmax() const { return xmax_; } + double get_edf() const { return edf_; } + double get_loglik() const { return loglik_; } private: // data members @@ -51,10 +51,10 @@ class Kde1d { size_t nlevels_; double xmin_; double xmax_; - double bw_{NAN}; - size_t deg_{2}; - double loglik_{NAN}; - double edf_{NAN}; + double bw_{ NAN }; + size_t deg_{ 2 }; + double loglik_{ NAN }; + double edf_{ NAN }; static constexpr double K0_ = 0.3989425; // private methods @@ -67,8 +67,7 @@ class Kde1d { void check_levels(const Eigen::VectorXd& x) const; Eigen::VectorXd kern_gauss(const Eigen::VectorXd& x); - Eigen::MatrixXd fit_lp(const Eigen::VectorXd& x_ev, - const Eigen::VectorXd& x, + Eigen::MatrixXd fit_lp(const Eigen::VectorXd& x, const Eigen::VectorXd& weights); double calculate_infl(const size_t& n, const double& f0, @@ -80,12 +79,14 @@ class Kde1d { bool inverse = false); Eigen::VectorXd boundary_correct(const Eigen::VectorXd& x, const Eigen::VectorXd& fhat); - Eigen::VectorXd construct_grid_points(const Eigen::VectorXd& x, - const Eigen::VectorXd& weights); + Eigen::VectorXd construct_grid_points(const Eigen::VectorXd& x); Eigen::VectorXd finalize_grid(Eigen::VectorXd& grid_points); Eigen::VectorXd without_boundary_ext(const Eigen::VectorXd& grid_points); double select_bw(const Eigen::VectorXd& x, - double bw, double mult, size_t deg, size_t nlevels, + double bw, + double mult, + size_t deg, + size_t nlevels, const Eigen::VectorXd& weights) const; }; @@ -131,19 +132,16 @@ inline Kde1d::Kde1d(const Eigen::VectorXd& x, w /= w.sum(); if (nlevels_ > 0) xx = stats::equi_jitter(xx); + xx = boundary_transform(xx); // bandwidth selection - bw_ = select_bw(boundary_transform(xx), bw_, mult, deg, nlevels_, w); - - // construct grid on original domain - Eigen::VectorXd grid_points = construct_grid_points(xx, w); + bw_ = select_bw(xx, bw_, mult, deg, nlevels_, w); // fit model and evaluate in transformed domain - Eigen::MatrixXd fitted = fit_lp(boundary_transform(grid_points), - boundary_transform(xx), - w); + Eigen::MatrixXd fitted = fit_lp(xx, w); // correct estimated density for transformation + Eigen::VectorXd grid_points = construct_grid_points(xx); Eigen::VectorXd values = boundary_correct(grid_points, fitted.col(0)); // move boundary points to xmin/xmax @@ -151,10 +149,7 @@ inline Kde1d::Kde1d(const Eigen::VectorXd& x, // construct interpolation grid // (3 iterations for normalization to a proper density) - grid_ = interp::InterpolationGrid1d(grid_points, values, 3); - - // store normalized values - values = grid_.get_values(); + grid_ = interp::InterpolationGrid1d(grid_points, values, 0); // calculate log-likelihood of final estimate loglik_ = grid_.interpolate(x).cwiseMax(1e-20).array().log().sum(); @@ -162,12 +157,10 @@ inline Kde1d::Kde1d(const Eigen::VectorXd& x, // calculate effective degrees of freedom double n = x.size(); interp::InterpolationGrid1d infl_grid( - without_boundary_ext(grid_points), - without_boundary_ext(fitted.col(1).cwiseMin(2.0).cwiseMax(0)), 0); + grid_points, fitted.col(1).cwiseMin(2.0).cwiseMax(0), 0); edf_ = infl_grid.interpolate(x).sum(); } - //! construct model from an already fit interpolation grid. //! @param grid the interpolation grid. //! @param nlevels number of factor levels; 0 for continuous variables. @@ -188,12 +181,14 @@ inline Kde1d::Kde1d(const interp::InterpolationGrid1d& grid, //! computes the pdf of the kernel density estimate by interpolation. //! @param x vector of evaluation points. //! @return a vector of pdf values. -inline Eigen::VectorXd Kde1d::pdf(const Eigen::VectorXd& x) const +inline Eigen::VectorXd +Kde1d::pdf(const Eigen::VectorXd& x) const { return (nlevels_ == 0) ? pdf_continuous(x) : pdf_discrete(x); } -inline Eigen::VectorXd Kde1d::pdf_continuous(const Eigen::VectorXd& x) const +inline Eigen::VectorXd +Kde1d::pdf_continuous(const Eigen::VectorXd& x) const { Eigen::VectorXd fhat = grid_.interpolate(x); if (!std::isnan(xmin_)) { @@ -203,11 +198,13 @@ inline Eigen::VectorXd Kde1d::pdf_continuous(const Eigen::VectorXd& x) const fhat = (x.array() > xmax_).select(Eigen::VectorXd::Zero(x.size()), fhat); } - auto trunc = [] (const double& xx) { return std::max(xx, 0.0); }; - return tools::unaryExpr_or_nan(fhat, trunc);; + auto trunc = [](const double& xx) { return std::max(xx, 0.0); }; + return tools::unaryExpr_or_nan(fhat, trunc); + ; } -inline Eigen::VectorXd Kde1d::pdf_discrete(const Eigen::VectorXd& x) const +inline Eigen::VectorXd +Kde1d::pdf_discrete(const Eigen::VectorXd& x) const { check_levels(x); auto fhat = pdf_continuous(x); @@ -220,17 +217,20 @@ inline Eigen::VectorXd Kde1d::pdf_discrete(const Eigen::VectorXd& x) const //! computes the cdf of the kernel density estimate by numerical integration. //! @param x vector of evaluation points. //! @return a vector of cdf values. -inline Eigen::VectorXd Kde1d::cdf(const Eigen::VectorXd& x) const +inline Eigen::VectorXd +Kde1d::cdf(const Eigen::VectorXd& x) const { return (nlevels_ == 0) ? cdf_continuous(x) : cdf_discrete(x); } -inline Eigen::VectorXd Kde1d::cdf_continuous(const Eigen::VectorXd& x) const +inline Eigen::VectorXd +Kde1d::cdf_continuous(const Eigen::VectorXd& x) const { return grid_.integrate(x, /* normalize */ true); } -inline Eigen::VectorXd Kde1d::cdf_discrete(const Eigen::VectorXd& x) const +inline Eigen::VectorXd +Kde1d::cdf_discrete(const Eigen::VectorXd& x) const { check_levels(x); Eigen::VectorXd lvs = Eigen::VectorXd::LinSpaced(nlevels_, 0, nlevels_ - 1); @@ -238,24 +238,25 @@ inline Eigen::VectorXd Kde1d::cdf_discrete(const Eigen::VectorXd& x) const for (size_t i = 1; i < nlevels_; ++i) f_cum(i) += f_cum(i - 1); - return tools::unaryExpr_or_nan(x, [&f_cum] (const double& xx) { - return f_cum(static_cast(xx)); - }); + return tools::unaryExpr_or_nan( + x, [&f_cum](const double& xx) { return f_cum(static_cast(xx)); }); } //! computes the cdf of the kernel density estimate by numerical inversion. //! @param x vector of evaluation points. //! @return a vector of quantiles. -inline Eigen::VectorXd Kde1d::quantile(const Eigen::VectorXd& x) const +inline Eigen::VectorXd +Kde1d::quantile(const Eigen::VectorXd& x) const { if ((x.minCoeff() < 0) | (x.maxCoeff() > 1)) throw std::runtime_error("probabilities must lie in (0, 1)."); return (nlevels_ == 0) ? quantile_continuous(x) : quantile_discrete(x); } -inline Eigen::VectorXd Kde1d::quantile_continuous(const Eigen::VectorXd& x) const +inline Eigen::VectorXd +Kde1d::quantile_continuous(const Eigen::VectorXd& x) const { - auto cdf = [&] (const Eigen::VectorXd& xx) { return grid_.integrate(xx); }; + auto cdf = [&](const Eigen::VectorXd& xx) { return grid_.integrate(xx); }; auto q = tools::invert_f(x, cdf, grid_.get_grid_points().minCoeff(), @@ -271,11 +272,12 @@ inline Eigen::VectorXd Kde1d::quantile_continuous(const Eigen::VectorXd& x) cons return q; } -inline Eigen::VectorXd Kde1d::quantile_discrete(const Eigen::VectorXd& x) const +inline Eigen::VectorXd +Kde1d::quantile_discrete(const Eigen::VectorXd& x) const { Eigen::VectorXd lvs = Eigen::VectorXd::LinSpaced(nlevels_, 0, nlevels_ - 1); auto p = cdf(lvs); - auto quan = [&] (const double& pp) { + auto quan = [&](const double& pp) { size_t lv = 0; while ((pp >= p(lv)) && (lv < nlevels_ - 1)) lv++; @@ -289,14 +291,15 @@ inline Eigen::VectorXd Kde1d::quantile_discrete(const Eigen::VectorXd& x) const //! @param n the number of observations to simulate. //! @param seeds an optional vector of seeds. //! @return simulated observations from the kernel density. -inline Eigen::VectorXd Kde1d::simulate(size_t n, - const std::vector& seeds) const +inline Eigen::VectorXd +Kde1d::simulate(size_t n, const std::vector& seeds) const { auto u = stats::simulate_uniform(n, seeds); return this->quantile(u); } -inline void Kde1d::check_levels(const Eigen::VectorXd& x) const +inline void +Kde1d::check_levels(const Eigen::VectorXd& x) const { if (nlevels_ == 0) return; @@ -312,9 +315,10 @@ inline void Kde1d::check_levels(const Eigen::VectorXd& x) const //! Gaussian kernel (truncated at +/- 5). //! @param x vector of evaluation points. -inline Eigen::VectorXd Kde1d::kern_gauss(const Eigen::VectorXd& x) +inline Eigen::VectorXd +Kde1d::kern_gauss(const Eigen::VectorXd& x) { - auto f = [] (double xx) { + auto f = [](double xx) { // truncate at +/- 5 if (std::fabs(xx) > 5.0) return 0.0; @@ -331,69 +335,42 @@ inline Eigen::VectorXd Kde1d::kern_gauss(const Eigen::VectorXd& x) //! @param weights vector of weights for each observation (can be empty). //! @return a two-column matrix containing the density estimate in the first //! and the influence function in the second column. -inline Eigen::MatrixXd Kde1d::fit_lp(const Eigen::VectorXd& x_ev, - const Eigen::VectorXd& x, - const Eigen::VectorXd& weights) +inline Eigen::MatrixXd +Kde1d::fit_lp(const Eigen::VectorXd& x, const Eigen::VectorXd& weights) { - Eigen::MatrixXd res(x_ev.size(), 2); size_t n = x.size(); - - double f0, f1, b; - double s = bw_; double w0 = 1.0; - Eigen::VectorXd xx(x.size()); - Eigen::VectorXd xx2(x.size()); - Eigen::VectorXd kernels(x.size()); - for (size_t k = 0; k < x_ev.size(); k++) { - double s = bw_; - // classical (local constant) kernel density estimate - xx = (x.array() - x_ev(k)) / bw_; - kernels = kern_gauss(xx) / bw_; - if (weights.size() > 0) - kernels = kernels.cwiseProduct(weights); - f0 = kernels.mean(); - res(k, 0) = f0; - - // Before continuing with higher-order polynomials, check - // (local constant) influence. If it is close to one, there is only one - // observation contributing to the estimate and it is evaluated close to - // it. To avoid numerical issues in this case, we just use the - // local constant estimate. - if (weights.size()) { - // find weight corresponding to observation closest to x_ev(k) - w0 = weights(tools::find_min_index(xx.array().abs())); - } - res(k, 1) = K0_ * w0 / (n * bw_) / f0; - if (res(k, 1) > 0.95) { - continue; - } - if (deg_ > 0) { - // calculate b for local linear - xx /= bw_; - f1 = xx.cwiseProduct(kernels).mean(); // first order derivative - b = f1 / f0; - - if (deg_ == 2) { - // more calculations for local quadratic - xx2 = xx.cwiseProduct(kernels) / (f0 * static_cast(n)); - b *= std::pow(bw_, 2); - s = 1.0 / (std::pow(bw_, 4) * xx.transpose() * xx2 - b*b); - res(k, 0) *= bw_ * std::sqrt(s); - } - - // final estimate - res(k, 0) *= std::exp(-0.5 * std::pow(b, 2) * s); - res(k, 1) = calculate_infl(n, f0, b, bw_, s, w0); - - if (std::isnan(res(k)) | std::isinf(res(k))) { - // inverse operation might go wrong due to rounding when - // true value is equal or close to zero - res(k, 0) = 0.0; - res(k, 1) = 0.0; - } - } + fft::KdeFFT kde_fft(x, bw_, weights); + Eigen::VectorXd f0 = kde_fft.kde_drv(0); + + Eigen::MatrixXd res(f0.size(), 2); + res.col(0) = f0; + res.col(1) = K0_ * w0 / (n * bw_) * f0.cwiseInverse(); + if (deg_ == 0) + return res; + + // deg > 0 + Eigen::VectorXd f1 = kde_fft.kde_drv(1); + Eigen::VectorXd S = Eigen::VectorXd::Constant(f0.size(), bw_); + Eigen::VectorXd b = f1.cwiseQuotient(f0); + if (deg_ == 2) { + Eigen::VectorXd f2 = kde_fft.kde_drv(2); + // D/R is notation from Hjort and Jones' AoS paper + Eigen::VectorXd D = f2.cwiseQuotient(f0) - b.cwiseProduct(b); + Eigen::VectorXd R = 1 / (1.0 + bw_ * bw_ * D.array()).sqrt(); + // this is our notation + S = (R / bw_).array().pow(2); + b *= bw_ * bw_; + res.col(0) = bw_ * S.cwiseSqrt().cwiseProduct(res.col(0)); + } + res.col(0) = res.col(0).array() * (-0.5 * b.array().pow(2) * S.array()).exp(); + for (size_t k = 0; k < f0.size(); k++) { + // TODO: weights + res(k, 1) = calculate_infl(n, f0(k), b(k), bw_, S(k), w0); + if (std::isnan(res(k, 0))) + res.row(k).setZero(); } return res; @@ -401,26 +378,28 @@ inline Eigen::MatrixXd Kde1d::fit_lp(const Eigen::VectorXd& x_ev, //! calculate influence for data point for density estimate based on //! quantities pre-computed in `fit_lp()`. -inline double Kde1d::calculate_infl(const size_t &n, - const double& f0, - const double& b, - const double& bw, - const double& s, - const double& weight) +inline double +Kde1d::calculate_infl(const size_t& n, + const double& f0, + const double& b, + const double& bw, + const double& s, + const double& weight) { - Eigen::MatrixXd M; + double M_inverse00; double bw2 = std::pow(bw, 2); double b2 = std::pow(b, 2); if (deg_ == 0) { - M = Eigen::MatrixXd::Constant(1, 1, f0); + M_inverse00 = 1 / f0; } else if (deg_ == 1) { - M = Eigen::MatrixXd(2, 2); + Eigen::Matrix2d M; M(0, 0) = f0; M(0, 1) = bw2 * b * f0; M(1, 0) = M(0, 1); M(1, 1) = f0 * bw2 + f0 * bw2 * bw2 * b2; + M_inverse00 = M.inverse()(0, 0); } else if (deg_ == 2) { - M = Eigen::MatrixXd(3, 3); + Eigen::Matrix3d M; M(0, 0) = f0; M(0, 1) = f0 * b; M(1, 0) = M(0, 1); @@ -428,21 +407,21 @@ inline double Kde1d::calculate_infl(const size_t &n, M(1, 2) = 0.5 * f0 * (3.0 / s * b + b * b2); M(2, 1) = M(1, 2); M(2, 2) = 0.25 * f0; - M(2, 2) *= 3.0 / std::pow(s, 2) + 6.0 / s * b2 + b2 * b2; + M(2, 2) *= 3.0 / std::pow(s, 2) + 6.0 / s * b2 + b2 * b2; M(0, 2) = M(2, 2); M(2, 0) = M(2, 2); + M_inverse00 = M.inverse()(0, 0); } - return K0_ * weight / (n * bw) * M.inverse()(0, 0); + return K0_ * weight / (n * bw) * M_inverse00; } - //! transformations for density estimates with bounded support. //! @param x evaluation points. //! @param inverse whether the inverse transformation should be applied. //! @return the transformed evaluation points. -inline Eigen::VectorXd Kde1d::boundary_transform(const Eigen::VectorXd& x, - bool inverse) +inline Eigen::VectorXd +Kde1d::boundary_transform(const Eigen::VectorXd& x, bool inverse) { Eigen::VectorXd x_new = x; if (!inverse) { @@ -465,7 +444,7 @@ inline Eigen::VectorXd Kde1d::boundary_transform(const Eigen::VectorXd& x, // two boundaries -> probit transform auto rng = xmax_ - xmin_; x_new = stats::pnorm(x).array() + xmin_ - 5e-5 * rng; - x_new *= (xmax_ - xmin_ + 1e-4 * rng); + x_new *= (xmax_ - xmin_ + 1e-4 * rng); } else if (!std::isnan(xmin_)) { // left boundary -> log transform x_new = x.array().exp() + xmin_ - 1e-3; @@ -485,8 +464,8 @@ inline Eigen::VectorXd Kde1d::boundary_transform(const Eigen::VectorXd& x, //! @param x evaluation points (in original domain). //! @param fhat the density estimate evaluated in the transformed domain. //! @return corrected density estimates at `x`. -inline Eigen::VectorXd Kde1d::boundary_correct(const Eigen::VectorXd& x, - const Eigen::VectorXd& fhat) +inline Eigen::VectorXd +Kde1d::boundary_correct(const Eigen::VectorXd& x, const Eigen::VectorXd& fhat) { Eigen::VectorXd corr_term(fhat.size()); if (!std::isnan(xmin_) & !std::isnan(xmax_)) { @@ -512,44 +491,22 @@ inline Eigen::VectorXd Kde1d::boundary_correct(const Eigen::VectorXd& x, //! constructs a grid that is later used for interpolation. //! @param x vector of observations. //! @return a grid of size 50. -inline Eigen::VectorXd Kde1d::construct_grid_points( - const Eigen::VectorXd& x, - const Eigen::VectorXd& weights) +inline Eigen::VectorXd +Kde1d::construct_grid_points(const Eigen::VectorXd& x) { - // hybrid grid: quantiles and two equally spaced points between them - auto qgrid = stats::quantile( - x, Eigen::VectorXd::LinSpaced(14, 0, 1), weights); - size_t grid_size = 53; - Eigen::VectorXd inner_grid(grid_size); - for (unsigned i = 0; i < qgrid.size() - 1; i++) { - inner_grid.segment(i * 4, 5) = - Eigen::VectorXd::LinSpaced(5, qgrid(i), qgrid(i + 1)); - } - - // extend grid where there's no boundary - double range = inner_grid.maxCoeff() - inner_grid.minCoeff(); - Eigen::VectorXd lowr_ext, uppr_ext; - if (std::isnan(xmin_)) { - // no left boundary -> add a few points to the left - lowr_ext = Eigen::VectorXd(2); - lowr_ext[1] = inner_grid[0] - 1 * bw_; - lowr_ext[0] = inner_grid[0] - 2 * bw_; - } - if (std::isnan(xmax_)) { - // no right boundary -> add a few points to the right - uppr_ext = Eigen::VectorXd(2); - uppr_ext[0] = inner_grid[grid_size - 1] + 1 * bw_; - uppr_ext[1] = inner_grid[grid_size - 1] + 2 * bw_; - } - - Eigen::VectorXd grid_points(grid_size + uppr_ext.size() + lowr_ext.size()); - grid_points << lowr_ext, inner_grid, uppr_ext; - return grid_points; + Eigen::VectorXd xx(2); + xx << x.minCoeff(), x.maxCoeff(); + Eigen::VectorXd rng = boundary_transform(xx); + rng(0) -= 3 * bw_; + rng(1) += 3 * bw_; + auto gr = Eigen::VectorXd::LinSpaced(401, rng(0), rng(1)); + return boundary_transform(gr, true); } //! moves the boundary points of the grid to xmin/xmax (if non-NaN). //! @param grid_points the grid points. -inline Eigen::VectorXd Kde1d::finalize_grid(Eigen::VectorXd& grid_points) +inline Eigen::VectorXd +Kde1d::finalize_grid(Eigen::VectorXd& grid_points) { if (!std::isnan(xmin_)) grid_points(0) = xmin_; @@ -562,8 +519,8 @@ inline Eigen::VectorXd Kde1d::finalize_grid(Eigen::VectorXd& grid_points) //! removes the boundary extension from the grid_points (see //! `construct_grid_points`). //! @param grid_points the grid points. -inline Eigen::VectorXd Kde1d::without_boundary_ext( - const Eigen::VectorXd& grid_points) +inline Eigen::VectorXd +Kde1d::without_boundary_ext(const Eigen::VectorXd& grid_points) { size_t grid_start = 0; size_t grid_size = grid_points.size(); @@ -587,10 +544,13 @@ inline Eigen::VectorXd Kde1d::without_boundary_ext( //' @param deg polynomial degree. //' @return the selected bandwidth //' @noRd -inline double Kde1d::select_bw(const Eigen::VectorXd& x, - double bw, double mult, size_t deg, - size_t nlevels, - const Eigen::VectorXd& weights) const +inline double +Kde1d::select_bw(const Eigen::VectorXd& x, + double bw, + double mult, + size_t deg, + size_t nlevels, + const Eigen::VectorXd& weights) const { if (std::isnan(bw)) { bw::PluginBandwidthSelector selector(x, weights); diff --git a/inst/include/kde1d/kdefft.hpp b/inst/include/kde1d/kdefft.hpp index c833750..eca7ff1 100644 --- a/inst/include/kde1d/kdefft.hpp +++ b/inst/include/kde1d/kdefft.hpp @@ -40,6 +40,7 @@ class KdeFFT inline KdeFFT::KdeFFT(const Eigen::VectorXd& x, double bw, const Eigen::VectorXd& weights) + : bw_(bw) { if (weights.size() > 0 && (weights.size() != x.size())) throw std::runtime_error("x and weights must have the same size"); @@ -51,8 +52,8 @@ inline KdeFFT::KdeFFT(const Eigen::VectorXd& x, w = Eigen::VectorXd::Ones(x.size()); } - lower_ = x.minCoeff() - 3 * bw; - upper_ = x.maxCoeff() + 3 * bw; + lower_ = x.minCoeff() - 4 * bw; + upper_ = x.maxCoeff() + 4 * bw; bin_counts_ = linbin(x, lower_, upper_, w); } @@ -91,14 +92,14 @@ KdeFFT::kde_drv(size_t drv) const double delta = (upper_ - lower_) / num_bins_; double tau = 4.0 + drv; size_t L = std::floor(tau * bw_ / delta); - L = std::min(L, num_bins_); + L = std::min(L, num_bins_ + 1); double tmp_dbl = L * delta / bw_; Eigen::VectorXd arg = Eigen::VectorXd::LinSpaced(L + 1, 0.0, tmp_dbl); tmp_dbl = std::pow(bw_, drv + 1.0); arg = stats::dnorm_drv(arg, drv) / (tmp_dbl * bin_counts_.sum()); - tmp_dbl = num_bins_ + L + 1.0; + tmp_dbl = num_bins_ + L + 2.0; tmp_dbl = std::pow(2, std::ceil(std::log(tmp_dbl) / std::log(2))); size_t P = static_cast(tmp_dbl); @@ -107,14 +108,14 @@ KdeFFT::kde_drv(size_t drv) const arg2.tail(L) = arg.tail(L).reverse() * (drv % 2 ? -1.0 : 1.0); Eigen::VectorXd x2 = Eigen::VectorXd::Zero(P); - x2.head(num_bins_) = bin_counts_; + x2.head(num_bins_ + 1) = bin_counts_; Eigen::FFT fft; Eigen::VectorXcd tmp1 = fft.fwd(arg2); Eigen::VectorXcd tmp2 = fft.fwd(x2); tmp1 = tmp1.cwiseProduct(tmp2); tmp2 = fft.inv(tmp1); - return tmp2.head(num_bins_).real(); + return tmp2.head(num_bins_ + 1).real(); } } // end kde1d::bw diff --git a/inst/include/kde1d/stats.hpp b/inst/include/kde1d/stats.hpp index 4526324..af16768 100644 --- a/inst/include/kde1d/stats.hpp +++ b/inst/include/kde1d/stats.hpp @@ -2,11 +2,12 @@ #define BOOST_MATH_PROMOTE_DOUBLE_POLICY false #include +#include #include #include -#include -#include #include +#include +#include "tools.hpp" namespace kde1d { @@ -16,19 +17,20 @@ namespace stats { //! standard normal density //! @param x evaluation points. //! @return matrix of pdf values. -inline Eigen::MatrixXd dnorm(const Eigen::MatrixXd& x) +inline Eigen::MatrixXd +dnorm(const Eigen::MatrixXd& x) { boost::math::normal dist; - return x.unaryExpr([&dist](const double& y) { - return boost::math::pdf(dist, y); - }); + return x.unaryExpr( + [&dist](const double& y) { return boost::math::pdf(dist, y); }); }; //! standard normal density //! @param x evaluation points. //! @param drv order of the derivative //! @return matrix of pdf values. -inline Eigen::MatrixXd dnorm_drv(const Eigen::MatrixXd& x, unsigned drv) +inline Eigen::MatrixXd +dnorm_drv(const Eigen::MatrixXd& x, unsigned drv) { boost::math::normal dist; double rt2 = std::sqrt(2); @@ -46,31 +48,31 @@ inline Eigen::MatrixXd dnorm_drv(const Eigen::MatrixXd& x, unsigned drv) //! standard normal cdf //! @param x evaluation points. //! @return matrix of cdf values. -inline Eigen::MatrixXd pnorm(const Eigen::MatrixXd& x) +inline Eigen::MatrixXd +pnorm(const Eigen::MatrixXd& x) { boost::math::normal dist; - return x.unaryExpr([&dist](const double& y) { - return boost::math::cdf(dist, y); - }); + return x.unaryExpr( + [&dist](const double& y) { return boost::math::cdf(dist, y); }); }; //! standard normal quantiles //! @param x evaluation points. //! @return matrix of quantiles. -inline Eigen::MatrixXd qnorm(const Eigen::MatrixXd& x) +inline Eigen::MatrixXd +qnorm(const Eigen::MatrixXd& x) { boost::math::normal dist; - return x.unaryExpr([&dist](const double& y) { - return boost::math::quantile(dist, y); - }); + return x.unaryExpr( + [&dist](const double& y) { return boost::math::quantile(dist, y); }); }; //! empirical quantiles //! @param x data. //! @param q evaluation points. //! @return vector of quantiles. -inline Eigen::VectorXd quantile(const Eigen::VectorXd& x, - const Eigen::VectorXd& q) +inline Eigen::VectorXd +quantile(const Eigen::VectorXd& x, const Eigen::VectorXd& q) { double n = static_cast(x.size() - 1); size_t m = q.size(); @@ -96,14 +98,15 @@ inline Eigen::VectorXd quantile(const Eigen::VectorXd& x, //! @param q evaluation points. //! @param w vector of weights. //! @return vector of quantiles. -inline Eigen::VectorXd quantile(const Eigen::VectorXd& x, - const Eigen::VectorXd& q, - const Eigen::VectorXd& w) +inline Eigen::VectorXd +quantile(const Eigen::VectorXd& x, + const Eigen::VectorXd& q, + const Eigen::VectorXd& w) { if (w.size() == 0) - return quantile(x, q); + return quantile(x, q); if (w.size() != x.size()) - throw std::runtime_error("x and w must have the same size"); + throw std::runtime_error("x and w must have the same size"); double n = static_cast(x.size()); size_t m = q.size(); Eigen::VectorXd res(m); @@ -112,8 +115,8 @@ inline Eigen::VectorXd quantile(const Eigen::VectorXd& x, std::vector ind(n); for (size_t i = 0; i < n; ++i) ind[i] = i; - std::sort(ind.begin(), ind.end(), - [&x] (size_t i, size_t j) { return x(i) < x(j); }); + std::sort( + ind.begin(), ind.end(), [&x](size_t i, size_t j) { return x(i) < x(j); }); auto x2 = x; auto wcum = w; @@ -125,15 +128,16 @@ inline Eigen::VectorXd quantile(const Eigen::VectorXd& x, wacc += w(ind[i]); } - double wsum = w.sum() - w(ind[n - 1]);; + double wsum = w.sum() - w(ind[n - 1]); + ; for (size_t j = 0; j < m; ++j) { size_t i = 1; while ((wcum(i) < q(j) * wsum) & (i < n)) i++; res(j) = x2(i - 1); if (w(ind[i - 1]) > 1e-30) { - res(j) += (x2(i) - x2(i - 1)) * - (q(j) - wcum(i - 1) / wsum) / w(ind[i - 1]); + res(j) += + (x2(i) - x2(i - 1)) * (q(j) - wcum(i - 1) / wsum) / w(ind[i - 1]); } } @@ -145,7 +149,8 @@ inline Eigen::VectorXd quantile(const Eigen::VectorXd& x, // noise <- unname(unlist(lapply(tab, function(l) -0.5 + 1:l / (l + 1)))) // s <- sort(x, index.return = TRUE) // return((s$x + noise)[rank(x, ties.method = "first", na.last = "keep")]) -inline Eigen::VectorXd equi_jitter(const Eigen::VectorXd& x) +inline Eigen::VectorXd +equi_jitter(const Eigen::VectorXd& x) { size_t n = x.size(); @@ -198,12 +203,13 @@ inline Eigen::VectorXd equi_jitter(const Eigen::VectorXd& x) //! //! @return An size n vector of independent \f$ \mathrm{U}[0, 1] \f$ random //! variables. -inline Eigen::VectorXd simulate_uniform(size_t n, std::vector seeds) +inline Eigen::VectorXd +simulate_uniform(size_t n, std::vector seeds) { if (n < 1) throw std::runtime_error("n must be at least 1."); - if (seeds.size() == 0) { // no seeds provided, seed randomly + if (seeds.size() == 0) { // no seeds provided, seed randomly std::random_device rd{}; seeds = std::vector(5); for (auto& s : seeds) From c1194e423443bb12e867b503a1efd353ed805546 Mon Sep 17 00:00:00 2001 From: tnagler Date: Thu, 14 Nov 2019 20:13:38 +0100 Subject: [PATCH 3/7] proper weights in edf calculation --- inst/include/kde1d/kde1d.hpp | 23 ++++++++++++++++------- inst/include/kde1d/kdefft.hpp | 33 ++------------------------------- inst/include/kde1d/tools.hpp | 26 ++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 38 deletions(-) diff --git a/inst/include/kde1d/kde1d.hpp b/inst/include/kde1d/kde1d.hpp index 70c4525..d91665a 100644 --- a/inst/include/kde1d/kde1d.hpp +++ b/inst/include/kde1d/kde1d.hpp @@ -338,15 +338,24 @@ Kde1d::kern_gauss(const Eigen::VectorXd& x) inline Eigen::MatrixXd Kde1d::fit_lp(const Eigen::VectorXd& x, const Eigen::VectorXd& weights) { - size_t n = x.size(); - double w0 = 1.0; - fft::KdeFFT kde_fft(x, bw_, weights); Eigen::VectorXd f0 = kde_fft.kde_drv(0); + Eigen::VectorXd wbin = Eigen::VectorXd::Ones(f0.size()); + if (weights.size()) { + // compute the average weight per cell + auto wcount = kde_fft.get_bin_counts(); + auto count = tools::linbin(x, + x.minCoeff() - 4 * bw_, + x.maxCoeff() + 4 * bw_, + f0.size() - 1, + wbin); + wbin = wcount.cwiseQuotient(count); + } + Eigen::MatrixXd res(f0.size(), 2); res.col(0) = f0; - res.col(1) = K0_ * w0 / (n * bw_) * f0.cwiseInverse(); + res.col(1) = K0_ / (x.size() * bw_) * wbin.cwiseQuotient(f0); if (deg_ == 0) return res; @@ -368,7 +377,7 @@ Kde1d::fit_lp(const Eigen::VectorXd& x, const Eigen::VectorXd& weights) for (size_t k = 0; k < f0.size(); k++) { // TODO: weights - res(k, 1) = calculate_infl(n, f0(k), b(k), bw_, S(k), w0); + res(k, 1) = calculate_infl(x.size(), f0(k), b(k), bw_, S(k), wbin(k)); if (std::isnan(res(k, 0))) res.row(k).setZero(); } @@ -497,8 +506,8 @@ Kde1d::construct_grid_points(const Eigen::VectorXd& x) Eigen::VectorXd xx(2); xx << x.minCoeff(), x.maxCoeff(); Eigen::VectorXd rng = boundary_transform(xx); - rng(0) -= 3 * bw_; - rng(1) += 3 * bw_; + rng(0) -= 4 * bw_; + rng(1) += 4 * bw_; auto gr = Eigen::VectorXd::LinSpaced(401, rng(0), rng(1)); return boundary_transform(gr, true); } diff --git a/inst/include/kde1d/kdefft.hpp b/inst/include/kde1d/kdefft.hpp index eca7ff1..da44c33 100644 --- a/inst/include/kde1d/kdefft.hpp +++ b/inst/include/kde1d/kdefft.hpp @@ -1,5 +1,6 @@ #pragma once +#include "tools.hpp" #include "stats.hpp" #include @@ -23,11 +24,6 @@ class KdeFFT void set_bw(double bw) { bw_ = bw; }; private: - Eigen::VectorXd linbin(const Eigen::VectorXd& x, - double lower, - double upper, - const Eigen::VectorXd& weights) const; - double bw_; Eigen::VectorXd bin_counts_; double lower_; @@ -54,34 +50,9 @@ inline KdeFFT::KdeFFT(const Eigen::VectorXd& x, lower_ = x.minCoeff() - 4 * bw; upper_ = x.maxCoeff() + 4 * bw; - bin_counts_ = linbin(x, lower_, upper_, w); + bin_counts_ = tools::linbin(x, lower_, upper_, num_bins_, w); } -//! Computes bin counts for univariate data via the linear binning strategy. -//! @param x vector of observations -//! @param weights vector of weights for each observation. -inline Eigen::VectorXd -KdeFFT::linbin(const Eigen::VectorXd& x, - double lower, - double upper, - const Eigen::VectorXd& weights) const -{ - Eigen::VectorXd gcnts = Eigen::VectorXd::Zero(num_bins_ + 1); - double rem, lxi, delta; - - delta = (upper_ - lower_) / num_bins_; - for (size_t i = 0; i < x.size(); ++i) { - lxi = (x(i) - lower_) / delta; - size_t li = static_cast(lxi); - rem = lxi - li; - if (li < num_bins_) { - gcnts(li) += (1 - rem) * weights(i); - gcnts(li + 1) += rem * weights(i); - } - } - - return gcnts; -} //! Binned kernel density derivative estimate //! @param drv order of derivative. diff --git a/inst/include/kde1d/tools.hpp b/inst/include/kde1d/tools.hpp index feae09d..768770b 100644 --- a/inst/include/kde1d/tools.hpp +++ b/inst/include/kde1d/tools.hpp @@ -102,6 +102,32 @@ inline Eigen::Matrix get_order( return order; } +//! Computes bin counts for univariate data via the linear binning strategy. +//! @param x vector of observations +//! @param weights vector of weights for each observation. +inline Eigen::VectorXd linbin(const Eigen::VectorXd& x, + double lower, + double upper, + size_t num_bins, + const Eigen::VectorXd& weights) +{ + Eigen::VectorXd gcnts = Eigen::VectorXd::Zero(num_bins + 1); + double rem, lxi, delta; + + delta = (upper - lower) / num_bins; + for (size_t i = 0; i < x.size(); ++i) { + lxi = (x(i) - lower) / delta; + size_t li = static_cast(lxi); + rem = lxi - li; + if (li < num_bins) { + gcnts(li) += (1 - rem) * weights(i); + gcnts(li + 1) += rem * weights(i); + } + } + + return gcnts; +} + } // end kde1d tools } // end kde1d From 2c2e5a26d49a3f4f6caa8c69c64f95286c538a09 Mon Sep 17 00:00:00 2001 From: tnagler Date: Thu, 14 Nov 2019 22:00:06 +0100 Subject: [PATCH 4/7] fix grid related issues --- inst/include/kde1d/dpik.hpp | 2 +- inst/include/kde1d/kde1d.hpp | 62 ++++++++++++++++++++--------------- inst/include/kde1d/kdefft.hpp | 12 +++++-- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/inst/include/kde1d/dpik.hpp b/inst/include/kde1d/dpik.hpp index 7319074..7bb5bd2 100644 --- a/inst/include/kde1d/dpik.hpp +++ b/inst/include/kde1d/dpik.hpp @@ -38,7 +38,7 @@ inline PluginBandwidthSelector::PluginBandwidthSelector( const Eigen::VectorXd& x, const Eigen::VectorXd& weights) : weights_(weights) - , kde_(fft::KdeFFT(x, 0.0, weights)) + , kde_(fft::KdeFFT(x, 0.0, x.minCoeff(), x.maxCoeff(), weights)) { if (weights.size() == 0) { weights_ = Eigen::VectorXd::Ones(x.size()); diff --git a/inst/include/kde1d/kde1d.hpp b/inst/include/kde1d/kde1d.hpp index d91665a..db4fb99 100644 --- a/inst/include/kde1d/kde1d.hpp +++ b/inst/include/kde1d/kde1d.hpp @@ -68,6 +68,7 @@ class Kde1d void check_levels(const Eigen::VectorXd& x) const; Eigen::VectorXd kern_gauss(const Eigen::VectorXd& x); Eigen::MatrixXd fit_lp(const Eigen::VectorXd& x, + const Eigen::VectorXd& grid, const Eigen::VectorXd& weights); double calculate_infl(const size_t& n, const double& f0, @@ -138,10 +139,10 @@ inline Kde1d::Kde1d(const Eigen::VectorXd& x, bw_ = select_bw(xx, bw_, mult, deg, nlevels_, w); // fit model and evaluate in transformed domain - Eigen::MatrixXd fitted = fit_lp(xx, w); + Eigen::VectorXd grid_points = construct_grid_points(xx); + Eigen::MatrixXd fitted = fit_lp(xx, boundary_transform(grid_points), w); // correct estimated density for transformation - Eigen::VectorXd grid_points = construct_grid_points(xx); Eigen::VectorXd values = boundary_correct(grid_points, fitted.col(0)); // move boundary points to xmin/xmax @@ -149,7 +150,7 @@ inline Kde1d::Kde1d(const Eigen::VectorXd& x, // construct interpolation grid // (3 iterations for normalization to a proper density) - grid_ = interp::InterpolationGrid1d(grid_points, values, 0); + grid_ = interp::InterpolationGrid1d(grid_points, values, 3); // calculate log-likelihood of final estimate loglik_ = grid_.interpolate(x).cwiseMax(1e-20).array().log().sum(); @@ -336,19 +337,22 @@ Kde1d::kern_gauss(const Eigen::VectorXd& x) //! @return a two-column matrix containing the density estimate in the first //! and the influence function in the second column. inline Eigen::MatrixXd -Kde1d::fit_lp(const Eigen::VectorXd& x, const Eigen::VectorXd& weights) +Kde1d::fit_lp(const Eigen::VectorXd& x, + const Eigen::VectorXd& grid_points, + const Eigen::VectorXd& weights) { - fft::KdeFFT kde_fft(x, bw_, weights); + size_t m = grid_points.size(); + fft::KdeFFT kde_fft(x, bw_, grid_points(0), grid_points(m - 1), weights); Eigen::VectorXd f0 = kde_fft.kde_drv(0); - Eigen::VectorXd wbin = Eigen::VectorXd::Ones(f0.size()); + Eigen::VectorXd wbin = Eigen::VectorXd::Ones(m); if (weights.size()) { // compute the average weight per cell auto wcount = kde_fft.get_bin_counts(); auto count = tools::linbin(x, - x.minCoeff() - 4 * bw_, - x.maxCoeff() + 4 * bw_, - f0.size() - 1, + grid_points(0), + grid_points(m - 1), + m - 1, wbin); wbin = wcount.cwiseQuotient(count); } @@ -375,7 +379,7 @@ Kde1d::fit_lp(const Eigen::VectorXd& x, const Eigen::VectorXd& weights) } res.col(0) = res.col(0).array() * (-0.5 * b.array().pow(2) * S.array()).exp(); - for (size_t k = 0; k < f0.size(); k++) { + for (size_t k = 0; k < m; k++) { // TODO: weights res(k, 1) = calculate_infl(x.size(), f0(k), b(k), bw_, S(k), wbin(k)); if (std::isnan(res(k, 0))) @@ -441,10 +445,10 @@ Kde1d::boundary_transform(const Eigen::VectorXd& x, bool inverse) x_new = stats::qnorm(x_new); } else if (!std::isnan(xmin_)) { // left boundary -> log transform - x_new = (1e-3 + x.array() - xmin_).log(); + x_new = (1e-5 + x.array() - xmin_).log(); } else if (!std::isnan(xmax_)) { // right boundary -> negative log transform - x_new = (1e-3 + xmax_ - x.array()).log(); + x_new = (1e-5 + xmax_ - x.array()).log(); } else { // no boundary -> no transform } @@ -453,13 +457,13 @@ Kde1d::boundary_transform(const Eigen::VectorXd& x, bool inverse) // two boundaries -> probit transform auto rng = xmax_ - xmin_; x_new = stats::pnorm(x).array() + xmin_ - 5e-5 * rng; - x_new *= (xmax_ - xmin_ + 1e-4 * rng); + x_new *= (xmax_ - xmin_ + 1e-4 * rng); } else if (!std::isnan(xmin_)) { // left boundary -> log transform - x_new = x.array().exp() + xmin_ - 1e-3; + x_new = x.array().exp() + xmin_ - 1e-5; } else if (!std::isnan(xmax_)) { // right boundary -> negative log transform - x_new = -(x.array().exp() - xmax_ - 1e-3); + x_new = -(x.array().exp() - xmax_ - 1e-5); } else { // no boundary -> no transform } @@ -479,16 +483,17 @@ Kde1d::boundary_correct(const Eigen::VectorXd& x, const Eigen::VectorXd& fhat) Eigen::VectorXd corr_term(fhat.size()); if (!std::isnan(xmin_) & !std::isnan(xmax_)) { // two boundaries -> probit transform - corr_term = (x.array() - xmin_ + 5e-5) / (xmax_ - xmin_ + 1e-4); + auto rng = xmax_ - xmin_; + corr_term = (x.array() - xmin_ + 5e-5 * rng) / (xmax_ - xmin_ + 1e-4 * rng); corr_term = stats::dnorm(stats::qnorm(corr_term)); - corr_term /= (xmax_ - xmin_ + 1e-4); + corr_term /= (xmax_ - xmin_ + 1e-4 * rng); corr_term = 1.0 / corr_term.array().max(1e-6); } else if (!std::isnan(xmin_)) { // left boundary -> log transform - corr_term = 1.0 / (1e-3 + x.array() - xmin_); + corr_term = 1.0 / (1e-5 + x.array() - xmin_).max(1e-6); } else if (!std::isnan(xmax_)) { // right boundary -> negative log transform - corr_term = 1.0 / (1e-3 + xmax_ - x.array()); + corr_term = 1.0 / (1e-5 + xmax_ - x.array()).max(1e-6); } else { // no boundary -> no transform corr_term.fill(1.0); @@ -497,19 +502,22 @@ Kde1d::boundary_correct(const Eigen::VectorXd& x, const Eigen::VectorXd& fhat) return fhat.array() * corr_term.array(); } -//! constructs a grid that is later used for interpolation. +//! constructs a grid later used for interpolation //! @param x vector of observations. //! @return a grid of size 50. inline Eigen::VectorXd Kde1d::construct_grid_points(const Eigen::VectorXd& x) { - Eigen::VectorXd xx(2); - xx << x.minCoeff(), x.maxCoeff(); - Eigen::VectorXd rng = boundary_transform(xx); - rng(0) -= 4 * bw_; - rng(1) += 4 * bw_; - auto gr = Eigen::VectorXd::LinSpaced(401, rng(0), rng(1)); - return boundary_transform(gr, true); + Eigen::VectorXd rng(2); + rng << x.minCoeff(), x.maxCoeff(); + if (std::isnan(xmin_) && std::isnan(xmax_)) + rng(0) -= 4 * bw_; + if (std::isnan(xmax_)) + rng(1) += 4 * bw_; + if (std::isnan(xmin_) && !std::isnan(xmax_)) + std::swap(rng(0), rng(1)); + auto zgrid = Eigen::VectorXd::LinSpaced(401, rng(0), rng(1)); + return boundary_transform(zgrid, true); } //! moves the boundary points of the grid to xmin/xmax (if non-NaN). diff --git a/inst/include/kde1d/kdefft.hpp b/inst/include/kde1d/kdefft.hpp index da44c33..ca712e0 100644 --- a/inst/include/kde1d/kdefft.hpp +++ b/inst/include/kde1d/kdefft.hpp @@ -17,6 +17,8 @@ class KdeFFT public: KdeFFT(const Eigen::VectorXd& x, double bw, + double lower, + double upper, const Eigen::VectorXd& weights = Eigen::VectorXd()); Eigen::VectorXd kde_drv(size_t drv) const; @@ -32,11 +34,18 @@ class KdeFFT }; //! @param x vector of observations. +//! @param bw the bandwidth parameter. +//! @param lower lower bound of the grid. +//! @param upper bound of the grid. //! @param weigths optional vector of weights for each observation. inline KdeFFT::KdeFFT(const Eigen::VectorXd& x, double bw, + double lower, + double upper, const Eigen::VectorXd& weights) : bw_(bw) + , lower_(lower) + , upper_(upper) { if (weights.size() > 0 && (weights.size() != x.size())) throw std::runtime_error("x and weights must have the same size"); @@ -47,9 +56,6 @@ inline KdeFFT::KdeFFT(const Eigen::VectorXd& x, } else { w = Eigen::VectorXd::Ones(x.size()); } - - lower_ = x.minCoeff() - 4 * bw; - upper_ = x.maxCoeff() + 4 * bw; bin_counts_ = tools::linbin(x, lower_, upper_, num_bins_, w); } From 319056a642172b1bf3f8ff67d063d427fc598c13 Mon Sep 17 00:00:00 2001 From: tnagler Date: Thu, 14 Nov 2019 22:02:42 +0100 Subject: [PATCH 5/7] eliminate dead code --- R/tools.R | 14 -------------- inst/include/kde1d/kde1d.hpp | 19 ------------------- inst/include/kde1d/tools.hpp | 6 ------ tests/testthat/test_kde1d.R | 3 +-- 4 files changed, 1 insertion(+), 41 deletions(-) diff --git a/R/tools.R b/R/tools.R index c0d8061..c9c27fb 100644 --- a/R/tools.R +++ b/R/tools.R @@ -58,20 +58,6 @@ check_arguments <- function(x, mult, xmin, xmax, bw, deg, weights) { } } -#' adjusts observations and evaluation points for boundary effects -#' @importFrom stats qnorm -#' @noRd -boundary_transform <- function(x, xmin, xmax) { - if (!is.nan(xmin) & !is.nan(xmax)) { # two boundaries - x <- qnorm((x - xmin) / (xmax - xmin + 1e-1)) - } else if (!is.nan(xmin)) { # left boundary - x <- log(x - xmin + 1e-3) - } else if (!is.nan(xmax)) { # right boundary - x <- log(xmax - x + 1e-3) - } - - x -} #' prepares evaluation points observations and evaluation points for boundary effects #' @importFrom stats qnorm diff --git a/inst/include/kde1d/kde1d.hpp b/inst/include/kde1d/kde1d.hpp index db4fb99..fbfb350 100644 --- a/inst/include/kde1d/kde1d.hpp +++ b/inst/include/kde1d/kde1d.hpp @@ -82,7 +82,6 @@ class Kde1d const Eigen::VectorXd& fhat); Eigen::VectorXd construct_grid_points(const Eigen::VectorXd& x); Eigen::VectorXd finalize_grid(Eigen::VectorXd& grid_points); - Eigen::VectorXd without_boundary_ext(const Eigen::VectorXd& grid_points); double select_bw(const Eigen::VectorXd& x, double bw, double mult, @@ -533,24 +532,6 @@ Kde1d::finalize_grid(Eigen::VectorXd& grid_points) return grid_points; } -//! removes the boundary extension from the grid_points (see -//! `construct_grid_points`). -//! @param grid_points the grid points. -inline Eigen::VectorXd -Kde1d::without_boundary_ext(const Eigen::VectorXd& grid_points) -{ - size_t grid_start = 0; - size_t grid_size = grid_points.size(); - // (grid extension has length 2) - if (std::isnan(xmin_)) { - grid_start += 1; - grid_size -= 2; - } - if (std::isnan(xmax_)) - grid_size -= 2; - - return grid_points.segment(grid_start, grid_size); -} // Bandwidth for Kernel Density Estimation //' @param x vector of observations diff --git a/inst/include/kde1d/tools.hpp b/inst/include/kde1d/tools.hpp index 768770b..d1d441f 100644 --- a/inst/include/kde1d/tools.hpp +++ b/inst/include/kde1d/tools.hpp @@ -51,12 +51,6 @@ inline Eigen::VectorXd invert_f( return x_tmp; } -//! finds the index, where the minimum in a vector occurs. -//! @param x the vector. -inline size_t find_min_index(const Eigen::VectorXd& x) -{ - return std::min_element(x.data(), x.data() + x.size()) - x.data(); -} //! remove rows of a matrix which contain nan values or have zero weight //! @param x the matrix. diff --git a/tests/testthat/test_kde1d.R b/tests/testthat/test_kde1d.R index 27c01b3..9ab55b8 100644 --- a/tests/testthat/test_kde1d.R +++ b/tests/testthat/test_kde1d.R @@ -1,7 +1,6 @@ context("Testing 'kde1d'") -set.seed(0) -n_sim <- 20 +n_sim <- 100 data_types <- c( "unbounded", "left_boundary", "right_boundary", "two_boundaries", "discrete" From 861c96e3c1f28ee6339f4c979a5e6b43399cd45d Mon Sep 17 00:00:00 2001 From: tnagler Date: Thu, 14 Nov 2019 23:06:56 +0100 Subject: [PATCH 6/7] safer integration --- inst/include/kde1d/interpolation.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/include/kde1d/interpolation.hpp b/inst/include/kde1d/interpolation.hpp index 7809f69..564acc2 100644 --- a/inst/include/kde1d/interpolation.hpp +++ b/inst/include/kde1d/interpolation.hpp @@ -169,7 +169,7 @@ inline Eigen::VectorXd InterpolationGrid1d::integrate(const Eigen::VectorXd& x, cum_int += cubic_integral(0.0, 1.0, tmp_coefs) * tmp_eps; k++; } - return res / cum_int; + return (res / cum_int).cwiseMin(1.0).cwiseMax(0.0); } // ---------------- Utility functions for spline interpolation ---------------- From aa2f016354e22627f657118102031e01199b3ba8 Mon Sep 17 00:00:00 2001 From: tnagler Date: Thu, 14 Nov 2019 23:47:39 +0100 Subject: [PATCH 7/7] reorder arguments for the feels --- R/kde1d.R | 6 +++--- tests/testthat/test_kde1d.R | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/kde1d.R b/R/kde1d.R index 7922509..9715cdc 100644 --- a/R/kde1d.R +++ b/R/kde1d.R @@ -115,10 +115,10 @@ kde1d <- function(x, xmin = NaN, xmax = NaN, mult = 1, bw = NA, deg = 2, weights = weights) # add info - fit$var_name <- as.character(match.call()[2]) - fit$nobs <- sum(!is.na(x)) - fit$weights <- weights fit$x <- x + fit$weights <- weights + fit$nobs <- sum(!is.na(x)) + fit$var_name <- as.character(match.call()[2]) fit } diff --git a/tests/testthat/test_kde1d.R b/tests/testthat/test_kde1d.R index 9ab55b8..2880d90 100644 --- a/tests/testthat/test_kde1d.R +++ b/tests/testthat/test_kde1d.R @@ -59,7 +59,7 @@ test_that("returns proper 'kde1d' object", { class_members <- c( "grid_points", "values", "nlevels", "bw", "xmin", "xmax", "deg", - "edf", "loglik", "var_name", "nobs", "weights", "x" + "edf", "loglik", "x", "weights", "nobs", "var_name" ) lapply(fits, function(x) expect_identical(names(x), class_members)) })