Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fit models with the fast fourier transform #43

Merged
merged 7 commits into from
Nov 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
.Ruserdata
revdep
/revdep/.cache.rds
.vscode
6 changes: 3 additions & 3 deletions R/kde1d.R
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,10 @@ kde1d <- function(x, xmin = NaN, xmax = NaN, mult = 1, bw = NA, deg = 2,
weights = weights)

# add info
fit$var_name <- as.character(match.call()[2])
fit$nobs <- sum(!is.na(x))
fit$weights <- weights
fit$x <- x
fit$weights <- weights
fit$nobs <- sum(!is.na(x))
fit$var_name <- as.character(match.call()[2])

fit
}
14 changes: 0 additions & 14 deletions R/tools.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,20 +58,6 @@ check_arguments <- function(x, mult, xmin, xmax, bw, deg, weights) {
}
}

#' adjusts observations and evaluation points for boundary effects
#' @importFrom stats qnorm
#' @noRd
boundary_transform <- function(x, xmin, xmax) {
if (!is.nan(xmin) & !is.nan(xmax)) { # two boundaries
x <- qnorm((x - xmin) / (xmax - xmin + 1e-1))
} else if (!is.nan(xmin)) { # left boundary
x <- log(x - xmin + 1e-3)
} else if (!is.nan(xmax)) { # right boundary
x <- log(xmax - x + 1e-3)
}

x
}

#' prepares evaluation points observations and evaluation points for boundary effects
#' @importFrom stats qnorm
Expand Down
168 changes: 48 additions & 120 deletions inst/include/kde1d/dpik.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#pragma once

#include "kdefft.hpp"
#include "stats.hpp"
#include <unsupported/Eigen/FFT>
#define _USE_MATH_DEFINES
#include <cmath>

Expand All @@ -13,132 +13,67 @@ namespace bw {
//! Methodology is similar to Sheather and Jones(1991), but asymptotic
//! bias/variance expressions are adapted for higher-order polynomials and
//! nearest neighbor bandwidths.
class PluginBandwidthSelector {
class PluginBandwidthSelector
{
public:
PluginBandwidthSelector(const Eigen::VectorXd& x,
const Eigen::VectorXd& weights = Eigen::VectorXd());
double select_bw(size_t deg);

private:
Eigen::VectorXd linbin(const Eigen::VectorXd& x,
const Eigen::VectorXd& weights);
double scale_est(const Eigen::VectorXd& x);
Eigen::VectorXd kde_drv(size_t drv);
void set_bw_for_bkfe(size_t drv);
double bkfe(size_t drv);
double get_bw_for_bkfe(size_t drv);
double ll_ibias2(size_t deg);
double ll_ivar(size_t deg);

size_t num_bins_{ 401 };
double bw_ { NAN };
double lower_;
double upper_;
fft::KdeFFT kde_;
Eigen::VectorXd weights_;
Eigen::VectorXd bin_counts_;
double scale_;
};


//! @param x vector of observations.
//! @param weigths optional vector of weights for each observation.
PluginBandwidthSelector::PluginBandwidthSelector(const Eigen::VectorXd& x,
const Eigen::VectorXd& weights)
: lower_(x.minCoeff())
, upper_(x.maxCoeff())
, weights_(weights)
inline PluginBandwidthSelector::PluginBandwidthSelector(
const Eigen::VectorXd& x,
const Eigen::VectorXd& weights)
: weights_(weights)
, kde_(fft::KdeFFT(x, 0.0, x.minCoeff(), x.maxCoeff(), weights))
{
if (weights.size() > 0 && (weights.size() != x.size()))
throw std::runtime_error("x and weights must have the same size");
if (weights.size() == 0) {
weights_ = Eigen::VectorXd::Ones(x.size());
} else {
weights_ = weights_ * x.size() / weights_.sum();
}

bin_counts_ = linbin(x, weights_);
bin_counts_ = kde_.get_bin_counts();
scale_ = scale_est(x);
}

//! Computes bin counts for univariate data via the linear binning strategy.
//! @param x vector of observations
//! @param weights vector of weights for each observation.
inline Eigen::VectorXd PluginBandwidthSelector::linbin(
const Eigen::VectorXd& x, const Eigen::VectorXd& weights)
{
Eigen::VectorXd gcnts = Eigen::VectorXd::Zero(num_bins_);
double rem, lxi, delta;

delta = (upper_ - lower_) / (num_bins_ - 1.0);
for (size_t i = 0; i < x.size(); ++i) {
lxi = (x(i) - lower_) / delta + 1.0;
size_t li = static_cast<size_t>(lxi);
rem = lxi - li;
if (li >= 1 && li < num_bins_) {
gcnts(li - 1) += (1 - rem) * weights(i);
gcnts(li) += rem * weights(i);
}
}

return gcnts;
}

//! Scale estimate (minimum of standard deviation and robust equivalent)
//! @param x vector of observations.
double PluginBandwidthSelector::scale_est(const Eigen::VectorXd& x)
inline double
PluginBandwidthSelector::scale_est(const Eigen::VectorXd& x)
{
double m_x = x.cwiseProduct(weights_).mean();
Eigen::VectorXd sx = (x - Eigen::VectorXd::Constant(x.size(), m_x));
double sd_x = std::sqrt(
sx.cwiseAbs2().cwiseProduct(weights_).sum()/(x.size() - 1));
double sd_x =
std::sqrt(sx.cwiseAbs2().cwiseProduct(weights_).sum() / (x.size() - 1));
Eigen::VectorXd q_x(2);
q_x << 0.25, 0.75;
q_x = stats::quantile(x, q_x, weights_);
double scale = std::min((q_x(1) - q_x(0))/1.349, sd_x);
double scale = std::min((q_x(1) - q_x(0)) / 1.349, sd_x);
if (scale == 0) {
scale = (sd_x > 0) ? sd_x : 1.0;
}
return scale;
}


//! Binned kernel density derivative estimate
//! @param drv order of derivative.
//! @return estimated derivative evaluated at the bin centers.
Eigen::VectorXd PluginBandwidthSelector::kde_drv(size_t drv)
{
double delta = (upper_ - lower_) / (num_bins_ - 1.0);
double tau = 4.0 + drv;
size_t L = std::floor(tau * bw_ / delta);
L = std::min(L, num_bins_);

double tmp_dbl = L * delta / bw_;
Eigen::VectorXd arg = Eigen::VectorXd::LinSpaced(L + 1, 0.0, tmp_dbl);
tmp_dbl = std::pow(bw_, drv + 1.0);
arg = stats::dnorm_drv(arg, drv) / (tmp_dbl * bin_counts_.sum());

tmp_dbl = num_bins_ + L + 1.0;
tmp_dbl = std::pow(2, std::ceil(std::log(tmp_dbl) / std::log(2)));
size_t P = static_cast<size_t>(tmp_dbl);

Eigen::VectorXd arg2 = Eigen::VectorXd::Zero(P);
arg2.head(L + 1) = arg;
arg2.tail(L) = arg.tail(L).reverse() * (drv % 2 ? -1.0 : 1.0);

Eigen::VectorXd x2 = Eigen::VectorXd::Zero(P);
x2.head(num_bins_) = bin_counts_;

Eigen::FFT<double> fft;
Eigen::VectorXcd tmp1 = fft.fwd(arg2);
Eigen::VectorXcd tmp2 = fft.fwd(x2);
tmp1 = tmp1.cwiseProduct(tmp2);
tmp2 = fft.inv(tmp1);
return tmp2.head(num_bins_).real();
}

//! optimal bandwidths for kernel functionals (see Wand and Jones' book, 3.5)
//! only works for even drv
//! @param drv order of the derivative in the kernel functional.
void PluginBandwidthSelector::set_bw_for_bkfe(size_t drv)
inline double
PluginBandwidthSelector::get_bw_for_bkfe(size_t drv)
{
if (drv % 2 != 0) {
throw std::runtime_error("only even drv allowed.");
Expand All @@ -149,57 +84,49 @@ void PluginBandwidthSelector::set_bw_for_bkfe(size_t drv)

// start with normal reference rule (eq 3.7)
int r = drv + 4;
double psi =((r/2) % 2 == 0) ? 1 : -1;
double psi = ((r / 2) % 2 == 0) ? 1 : -1;
psi *= std::tgamma(r + 1);
psi /= std::pow(2 * scale_, r + 1) * std::tgamma(r/2 + 1) * std::sqrt(M_PI);
psi /= std::pow(2 * scale_, r + 1) * std::tgamma(r / 2 + 1) * std::sqrt(M_PI);
double Kr = stats::dnorm_drv(Eigen::VectorXd::Zero(1), r - 2)(0);
bw_ = std::pow(-2 * Kr / (psi * n), 1.0 / (r + 1));
kde_.set_bw(std::pow(-2 * Kr / (psi * n), 1.0 / (r + 1)));

// now use plug in to select the actual bandwidth (eq. 3.6)
r -= 2;
psi = bkfe(drv + 2);
Kr = stats::dnorm_drv(Eigen::VectorXd::Zero(1), r - 2)(0);

bw_ = std::pow(-2 * Kr / (psi * n), 1.0 / (r + 1));
}
// that's bkfe()
psi = bin_counts_.cwiseProduct(kde_.kde_drv(drv + 2)).sum();
psi /= bin_counts_.sum();

Kr = stats::dnorm_drv(Eigen::VectorXd::Zero(1), r - 2)(0);

//! Binned Kernel Functional Estimate
//! @param x vector of bin counts
//! @param drv order of derivative in the density functional
//! @param h kernel bandwidth
//! @param a minimum value of x at which to compute the estimate
//! @param b maximum value of x at which to compute the estimate
//! @return the estimated functional
double PluginBandwidthSelector::bkfe(size_t drv)
{
return bin_counts_.cwiseProduct(kde_drv(drv)).sum() / bin_counts_.sum();
return std::pow(-2 * Kr / (psi * n), 1.0 / (r + 1));
}

//! computes the integrated squared bias (without bw and n terms).
//! Bias expressions can be found in Geenens (JASA, 2014)
//! @param deg degree of the local polynomial.
double PluginBandwidthSelector::ll_ibias2(size_t deg)
inline double
PluginBandwidthSelector::ll_ibias2(size_t deg)
{
Eigen::VectorXd arg;
if (deg == 0) {
set_bw_for_bkfe(4);
arg = 0.25 * kde_drv(4);
kde_.set_bw(get_bw_for_bkfe(4));
arg = 0.25 * kde_.kde_drv(4);
} else if (deg == 1) {
set_bw_for_bkfe(4);
Eigen::VectorXd f0 = kde_drv(0);
Eigen::VectorXd f1 = kde_drv(1);
Eigen::VectorXd f2 = kde_drv(2);
kde_.set_bw(get_bw_for_bkfe(4));
Eigen::VectorXd f0 = kde_.kde_drv(0);
Eigen::VectorXd f1 = kde_.kde_drv(1);
Eigen::VectorXd f2 = kde_.kde_drv(2);
arg = (0.5 * f2 + f1.cwiseAbs2().cwiseQuotient(f0))
.cwiseAbs2().cwiseQuotient(f0);
.cwiseAbs2()
.cwiseQuotient(f0);
} else if (deg == 2) {
set_bw_for_bkfe(8);
Eigen::VectorXd f0 = kde_drv(0);
Eigen::VectorXd f1 = kde_drv(1);
Eigen::VectorXd f2 = kde_drv(2);
Eigen::VectorXd f4 = kde_drv(4);
kde_.set_bw(get_bw_for_bkfe(8));
Eigen::VectorXd f0 = kde_.kde_drv(0);
Eigen::VectorXd f1 = kde_.kde_drv(1);
Eigen::VectorXd f2 = kde_.kde_drv(2);
Eigen::VectorXd f4 = kde_.kde_drv(4);
arg = f4 - 3 * f2.cwiseAbs2().cwiseQuotient(f0) +
2 * (f1.array().pow(4) / f0.array().pow(3)).matrix();
2 * (f1.array().pow(4) / f0.array().pow(3)).matrix();
arg = (0.125 * arg).cwiseAbs2().cwiseQuotient(f0);
} else {
throw std::runtime_error("deg must be one of {0, 1, 2}.");
Expand All @@ -209,8 +136,9 @@ double PluginBandwidthSelector::ll_ibias2(size_t deg)

//! computes the integrated squared variance (without bw and n terms).
//! Variance expressions can be found in Geenens (JASA, 2014)
//! @param deg degree of the local polynomial.
double PluginBandwidthSelector::ll_ivar(size_t deg)
//! @param deg degree of the local polynomial.
inline double
PluginBandwidthSelector::ll_ivar(size_t deg)
{
if (deg > 2)
throw std::runtime_error("deg must be one of {0, 1, 2}.");
Expand All @@ -219,15 +147,16 @@ double PluginBandwidthSelector::ll_ivar(size_t deg)

//! Selects the bandwidth for kernel density estimation.
//! @param deg degree of the local polynomial.
inline double PluginBandwidthSelector::select_bw(size_t deg)
inline double
PluginBandwidthSelector::select_bw(size_t deg)
{
// effective sample size
double n = std::pow(weights_.sum(), 2) / weights_.cwiseAbs2().sum();
double bw;
int bwpow = (deg < 2 ? 4 : 8);
try {
double ibias2 = ll_ibias2(deg);
double ivar = ll_ivar(deg);
double ivar = ll_ivar(deg);
bw = std::pow(ivar / (bwpow * n * ibias2), 1.0 / (bwpow + 1));
} catch (...) {
bw = 4.0 * 1.06 * scale_ * std::pow(n, -1.0 / (bwpow + 1));
Expand All @@ -239,4 +168,3 @@ inline double PluginBandwidthSelector::select_bw(size_t deg)
} // end kde1d::bw

} // end kde1d

2 changes: 1 addition & 1 deletion inst/include/kde1d/interpolation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ inline Eigen::VectorXd InterpolationGrid1d::integrate(const Eigen::VectorXd& x,
cum_int += cubic_integral(0.0, 1.0, tmp_coefs) * tmp_eps;
k++;
}
return res / cum_int;
return (res / cum_int).cwiseMin(1.0).cwiseMax(0.0);
}

// ---------------- Utility functions for spline interpolation ----------------
Expand Down
Loading