diff --git a/R/RcppExports.R b/R/RcppExports.R index 06c21c0..0805f48 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -3,7 +3,9 @@ #' fits a kernel density estimate and calculates the effective degrees of #' freedom. -#' @param x vector of observations. +#' @param x vector of observations; catergorical data must be converted to +#' non-negative integers. +#' @param nlevels the number of factor levels; 0 for continuous data. #' @param bw the bandwidth parameter. #' @param xmin lower bound for the support of the density, `NaN` means no #' boundary. @@ -13,47 +15,35 @@ #' @return `An Rcpp::List` containing the fitted density values on a grid and #' additional information. #' @noRd -fit_kde1d_cpp <- function(x, bw, xmin, xmax, deg, weights) { - .Call('_kde1d_fit_kde1d_cpp', PACKAGE = 'kde1d', x, bw, xmin, xmax, deg, weights) +fit_kde1d_cpp <- function(x, nlevels, bw, mult, xmin, xmax, deg, weights) { + .Call('_kde1d_fit_kde1d_cpp', PACKAGE = 'kde1d', x, nlevels, bw, mult, xmin, xmax, deg, weights) } #' computes the pdf of a kernel density estimate by interpolation. #' @param x vector of evaluation points. -#' @param R_object the fitted object passed from R. +#' @param kde1d_r the fitted object passed from R. #' @return a vector of pdf values. #' @noRd -dkde1d_cpp <- function(x, R_object) { - .Call('_kde1d_dkde1d_cpp', PACKAGE = 'kde1d', x, R_object) +dkde1d_cpp <- function(x, kde1d_r) { + .Call('_kde1d_dkde1d_cpp', PACKAGE = 'kde1d', x, kde1d_r) } #' computes the cdf of a kernel density estimate by numerical integration. #' @param x vector of evaluation points. -#' @param R_object the fitted object passed from R. +#' @param kde1d_r the fitted object passed from R. #' @return a vector of cdf values. #' @noRd -pkde1d_cpp <- function(x, R_object) { - .Call('_kde1d_pkde1d_cpp', PACKAGE = 'kde1d', x, R_object) +pkde1d_cpp <- function(q, kde1d_r) { + .Call('_kde1d_pkde1d_cpp', PACKAGE = 'kde1d', q, kde1d_r) } #' computes the quantile of a kernel density estimate by numerical inversion #' (bisection method). #' @param x vector of evaluation points. -#' @param R_object the fitted object passed from R. +#' @param kde1d_r the fitted object passed from R. #' @return a vector of quantiles. #' @noRd -qkde1d_cpp <- function(x, R_object) { - .Call('_kde1d_qkde1d_cpp', PACKAGE = 'kde1d', x, R_object) -} - -#' @param x vector of observations -#' @param bw bandwidth parameter, NA for automatic selection. -#' @param mult bandwidth multiplier. -#' @param discrete whether a jittered estimate is computed. -#' @param weights vector of weights for each observation (can be empty). -#' @param deg polynomial degree. -#' @return the selected bandwidth -#' @noRd -select_bw_cpp <- function(x, bw, mult, discrete, weights, deg) { - .Call('_kde1d_select_bw_cpp', PACKAGE = 'kde1d', x, bw, mult, discrete, weights, deg) +qkde1d_cpp <- function(p, kde1d_r) { + .Call('_kde1d_qkde1d_cpp', PACKAGE = 'kde1d', p, kde1d_r) } diff --git a/R/kde1d_methods.R b/R/kde1d-methods.R similarity index 81% rename from R/kde1d_methods.R rename to R/kde1d-methods.R index 0fbafcf..4e5c330 100644 --- a/R/kde1d_methods.R +++ b/R/kde1d-methods.R @@ -32,22 +32,7 @@ #' @export dkde1d <- function(x, obj) { x <- prep_eval_arg(x, obj) - - # adjust grid to stabilize estimates - rng <- diff(range(obj$grid_points)) - if (!is.nan(obj$xmin)) - obj$grid_points[1] <- obj$xmin - 0.1 * rng - if (!is.nan(obj$xmax)) - obj$grid_points[length(obj$grid_points)] <- obj$xmax + 0.1 * rng - - fhat <- dkde1d_cpp(as.numeric(x), obj) - if (is.ordered(obj$x)) { - # for discrete variables we can normalize - f_all <- dkde1d_cpp(seq_along(levels(obj$x)), obj) - fhat <- fhat / sum(f_all) - } - - as.vector(fhat) + dkde1d_cpp(as.numeric(x), obj) } #' @param q vector of quantiles. @@ -55,39 +40,18 @@ dkde1d <- function(x, obj) { #' @export pkde1d <- function(q, obj) { q <- prep_eval_arg(q, obj) - - if (is.numeric(obj$x)) { - p <- pkde1d_cpp(q, obj) - } else { - if (!is.ordered(q)) { - q <- ordered(q, levels(obj$x)) - } - x_all <- ordered(levels(obj$x), levels(obj$x)) - p_all <- dkde1d(x_all, obj) - p_total <- sum(p_all) - p <- sapply(q, function(y) sum(p_all[x_all <= y] / p_total)) - p <- pmin(pmax(p, 0), 1) - } - - p + pkde1d_cpp(q, obj) } #' @param p vector of probabilities. #' @rdname dkde1d #' @export qkde1d <- function(p, obj) { - stopifnot(all(na.omit(p) > 0.0) & all(na.omit(p) < 1.0)) - if (is.numeric(obj$x)) { - q <- qkde1d_cpp(p, obj) - } else { - ## for discrete variables compute quantile from the density - x_all <- ordered(levels(obj$x), levels(obj$x)) - # pdf at all possible values of x - pp <- pkde1d(x_all, obj) - # generalized inverse - q <- x_all[vapply(p, function(y) which(y <= pp)[1], integer(1))] + q <- qkde1d_cpp(p, obj) + if (is.ordered(obj$x)) { + ## for discrete variables, add factor levels + q <- ordered(levels(obj$x)[q + 1], levels(obj$x)) } - q } diff --git a/R/kde1d.R b/R/kde1d.R index 2bf88fa..7922509 100644 --- a/R/kde1d.R +++ b/R/kde1d.R @@ -98,36 +98,27 @@ #' lines(kde1d(x), col = 2) #' @importFrom stats na.omit #' @export -kde1d <- function(x, xmin = NaN, xmax = NaN, mult = 1, bw = NA, - deg = 2, weights = numeric(0)) { +kde1d <- function(x, xmin = NaN, xmax = NaN, mult = 1, bw = NA, deg = 2, + weights = numeric(0)) { x <- na.omit(x) # sanity checks check_arguments(x, mult, xmin, xmax, bw, deg, weights) - w_norm <- weights / mean(weights) - - # equidistant jittering for discrete variables - x_jtr <- equi_jitter(x) - - # bandwidth selection - bw <- select_bw_cpp( - boundary_transform(na.omit(x_jtr), xmin, xmax), - bw, - mult, - is.ordered(x), - w_norm, - deg - ) # fit model - fit <- fit_kde1d_cpp(na.omit(x_jtr), bw, xmin, xmax, deg, w_norm) + fit <- fit_kde1d_cpp(x = if (is.numeric(x)) x else (as.numeric(x) - 1), + nlevels = length(levels(x)), + bw = bw, + mult = mult, + xmin = xmin, + xmax = xmax, + deg = deg, + weights = weights) # add info fit$var_name <- as.character(match.call()[2]) - fit$nobs <- length(x) + fit$nobs <- sum(!is.na(x)) fit$weights <- weights fit$x <- x - # return as kde1d object - class(fit) <- "kde1d" fit } diff --git a/R/tools.R b/R/tools.R index 1cf1da5..c0d8061 100644 --- a/R/tools.R +++ b/R/tools.R @@ -80,15 +80,15 @@ prep_eval_arg <- function(x, obj) { if (is.data.frame(x)) x <- x[[1]] if (!is.ordered(x) & is.ordered(obj$x)) - x <- ordered(x, levels(obj$x)) + x <- as.ordered(x) if (is.numeric(x)) return(x) stopifnot(is.ordered(x)) if (!all(levels(x) %in% levels(obj$x))) - stop("'x' contains levels that were not observed in the data.") + stop("'x' contains levels that weren't present when fitting.") levels(x) <- levels(obj$x) if (!is.ordered(x) & is.ordered(obj$x)) x <- ordered(x, levels(obj$x)) - x + as.numeric(x) - 1 } diff --git a/inst/include/kde1d-wrappers.hpp b/inst/include/kde1d-wrappers.hpp new file mode 100644 index 0000000..253b1ff --- /dev/null +++ b/inst/include/kde1d-wrappers.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include "kde1d/kde1d.hpp" + +namespace kde1d { + +inline Rcpp::List kde1d_wrap(const Kde1d& kde1d_cpp) +{ + auto kde1d_r = Rcpp::List::create( + Rcpp::Named("grid_points") = kde1d_cpp.get_grid_points(), + Rcpp::Named("values") = kde1d_cpp.get_values(), + Rcpp::Named("nlevels") = kde1d_cpp.get_nlevels(), + Rcpp::Named("bw") = kde1d_cpp.get_bw(), + Rcpp::Named("xmin") = kde1d_cpp.get_xmin(), + Rcpp::Named("xmax") = kde1d_cpp.get_xmax(), + Rcpp::Named("deg") = kde1d_cpp.get_deg(), + Rcpp::Named("edf") = kde1d_cpp.get_edf(), + Rcpp::Named("loglik") = kde1d_cpp.get_loglik() + ); + kde1d_r.attr("class") = "kde1d"; + + return kde1d_r; +} + +inline Kde1d kde1d_wrap(const Rcpp::List& kde1d_r) +{ + auto grid = interp::InterpolationGrid1d( + kde1d_r["grid_points"], kde1d_r["values"], 0); + return Kde1d(grid, kde1d_r["nlevels"], kde1d_r["xmin"], kde1d_r["xmax"]); +} + + +} \ No newline at end of file diff --git a/inst/include/dpik.hpp b/inst/include/kde1d/dpik.hpp similarity index 99% rename from inst/include/dpik.hpp rename to inst/include/kde1d/dpik.hpp index 847738e..cd8156f 100644 --- a/inst/include/dpik.hpp +++ b/inst/include/kde1d/dpik.hpp @@ -5,6 +5,10 @@ #define _USE_MATH_DEFINES #include +namespace kde1d { + +namespace bw { + //! Bandwidth selection for local-likelihood density estimation. //! Methodology is similar to Sheather and Jones(1991), but asymptotic //! bias/variance expressions are adapted for higher-order polynomials and @@ -232,3 +236,7 @@ inline double PluginBandwidthSelector::select_bw(size_t deg) return bw; } +} // end kde1d::bw + +} // end kde1d + diff --git a/inst/include/interpolation.hpp b/inst/include/kde1d/interpolation.hpp similarity index 90% rename from inst/include/interpolation.hpp rename to inst/include/kde1d/interpolation.hpp index 5f7c90b..5846361 100644 --- a/inst/include/interpolation.hpp +++ b/inst/include/kde1d/interpolation.hpp @@ -4,6 +4,10 @@ #include #include "tools.hpp" +namespace kde1d { + +namespace interp { + //! A class for cubic spline interpolation in one dimension //! //! The class is used for implementing kernel estimators. It makes storing the @@ -19,9 +23,9 @@ class InterpolationGrid1d void normalize(int times); - Eigen::VectorXd interpolate(const Eigen::VectorXd& x); + Eigen::VectorXd interpolate(const Eigen::VectorXd& x) const; - Eigen::VectorXd integrate(const Eigen::VectorXd& u); + Eigen::VectorXd integrate(const Eigen::VectorXd& u) const; Eigen::VectorXd get_values() const {return values_;} Eigen::VectorXd get_grid_points() const {return grid_points_;} @@ -29,28 +33,28 @@ class InterpolationGrid1d private: // Utility functions for spline Interpolation double cubic_poly(const double& x, - const Eigen::VectorXd& a); + const Eigen::VectorXd& a) const; double cubic_indef_integral(const double& x, - const Eigen::VectorXd& a); + const Eigen::VectorXd& a) const; double cubic_integral(const double& lower, const double& upper, - const Eigen::VectorXd& a); + const Eigen::VectorXd& a) const; Eigen::VectorXd find_coefs(const Eigen::VectorXd& vals, - const Eigen::VectorXd& grid); + const Eigen::VectorXd& grid) const; double interp_on_grid(const double& x, const Eigen::VectorXd& vals, - const Eigen::VectorXd& grid); + const Eigen::VectorXd& grid) const; - ptrdiff_t find_cell(const double& x0); + ptrdiff_t find_cell(const double& x0) const; // Utility functions for integration double int_on_grid(const double& upr, const Eigen::VectorXd& vals, - const Eigen::VectorXd& grid); + const Eigen::VectorXd& grid) const; Eigen::VectorXd grid_points_; Eigen::MatrixXd values_; @@ -88,7 +92,7 @@ inline void InterpolationGrid1d::normalize(int times) } } -inline ptrdiff_t InterpolationGrid1d::find_cell(const double& x0) +inline ptrdiff_t InterpolationGrid1d::find_cell(const double& x0) const { ptrdiff_t low = 0, high = grid_points_.size() - 1; ptrdiff_t mid; @@ -105,7 +109,8 @@ inline ptrdiff_t InterpolationGrid1d::find_cell(const double& x0) //! Interpolation //! @param x vector of evaluation points. -inline Eigen::VectorXd InterpolationGrid1d::interpolate(const Eigen::VectorXd& x) +inline Eigen::VectorXd InterpolationGrid1d::interpolate( + const Eigen::VectorXd& x) const { Eigen::VectorXd tmpgrid(4), tmpvals(4); ptrdiff_t m = grid_points_.size(); @@ -133,6 +138,7 @@ inline Eigen::VectorXd InterpolationGrid1d::interpolate(const Eigen::VectorXd& x //! //! @param x a vector of evaluation points inline Eigen::VectorXd InterpolationGrid1d::integrate(const Eigen::VectorXd& x) + const { auto integrate_one = [this] (const double& xx) { return int_on_grid(xx, this->values_, this->grid_points_); @@ -148,7 +154,7 @@ inline Eigen::VectorXd InterpolationGrid1d::integrate(const Eigen::VectorXd& x) //! @param x evaluation point. //! @param a polynomial coefficients inline double InterpolationGrid1d::cubic_poly(const double& x, - const Eigen::VectorXd& a) + const Eigen::VectorXd& a) const { double x2 = x * x; double x3 = x2 * x; @@ -160,7 +166,7 @@ inline double InterpolationGrid1d::cubic_poly(const double& x, //! @param x evaluation point. //! @param a polynomial coefficients. inline double InterpolationGrid1d::cubic_indef_integral( - const double& x, const Eigen::VectorXd& a) + const double& x, const Eigen::VectorXd& a) const { double x2 = x * x; double x3 = x2 * x; @@ -176,6 +182,7 @@ inline double InterpolationGrid1d::cubic_indef_integral( inline double InterpolationGrid1d::cubic_integral(const double& lower, const double& upper, const Eigen::VectorXd& a) + const { return cubic_indef_integral(upper, a) - cubic_indef_integral(lower, a); } @@ -185,7 +192,7 @@ inline double InterpolationGrid1d::cubic_integral(const double& lower, //! @param vals length 4 vector of function values. //! @param grid length 4 vector of grid points. inline Eigen::VectorXd InterpolationGrid1d::find_coefs( - const Eigen::VectorXd& vals, const Eigen::VectorXd& grid) + const Eigen::VectorXd& vals, const Eigen::VectorXd& grid) const { double dt0 = grid(1) - grid(0); double dt1 = grid(2) - grid(1); @@ -231,6 +238,7 @@ inline Eigen::VectorXd InterpolationGrid1d::find_coefs( inline double InterpolationGrid1d::interp_on_grid(const double& x, const Eigen::VectorXd& vals, const Eigen::VectorXd& grid) + const { double xev = (x - grid(1)) / (grid(2) - grid(1)); // use Gaussian tail for extrapolation @@ -258,6 +266,7 @@ inline double InterpolationGrid1d::interp_on_grid(const double& x, inline double InterpolationGrid1d::int_on_grid(const double& upr, const Eigen::VectorXd& vals, const Eigen::VectorXd& grid) + const { ptrdiff_t m = grid.size(); Eigen::VectorXd tmpvals(4), tmpgrid(4), tmpa(4); @@ -294,3 +303,7 @@ inline double InterpolationGrid1d::int_on_grid(const double& upr, return tmpint; } + +} // end kde1d::interp + +} // end kde1d \ No newline at end of file diff --git a/inst/include/lpdens.hpp b/inst/include/kde1d/kde1d.hpp similarity index 50% rename from inst/include/lpdens.hpp rename to inst/include/kde1d/kde1d.hpp index 2b1aab9..888c8b4 100644 --- a/inst/include/lpdens.hpp +++ b/inst/include/kde1d/kde1d.hpp @@ -1,25 +1,45 @@ #pragma once -#include "interpolation.hpp" -#include "stats.hpp" -#include "tools.hpp" +#include "kde1d/interpolation.hpp" +#include "kde1d/stats.hpp" +#include "kde1d/tools.hpp" +#include "kde1d/dpik.hpp" #include #include +namespace kde1d { + //! Local-polynomial density estimation in 1-d. -class LPDens1d { +class Kde1d { public: // constructors - LPDens1d() {} - LPDens1d(Eigen::VectorXd x, double bw, - double xmin, double xmax, size_t p, - const Eigen::VectorXd& weights = Eigen::VectorXd()); + Kde1d() {} + Kde1d(const Eigen::VectorXd& x, + size_t nlevels = 0, + double bw = NAN, + double mult = 1.0, + double xmin = NAN, + double xmax = NAN, + size_t deg = 2, + const Eigen::VectorXd& weights = Eigen::VectorXd()); + Kde1d(const interp::InterpolationGrid1d& grid, + size_t nlevels = 0, + double xmin = NAN, + double xmax = NAN); + + // statistical functions + Eigen::VectorXd pdf(const Eigen::VectorXd& x) const; + Eigen::VectorXd cdf(const Eigen::VectorXd& x) const; + Eigen::VectorXd quantile(const Eigen::VectorXd& x) const; + Eigen::VectorXd simulate(size_t n, + const std::vector& seeds = {}) const; // getters - Eigen::VectorXd get_values() const {return grid_.get_values();} - Eigen::VectorXd get_grid_points() const {return grid_.get_grid_points();} + Eigen::VectorXd get_values() const { return grid_.get_values(); } + Eigen::VectorXd get_grid_points() const { return grid_.get_grid_points(); } + size_t get_nlevels() const { return nlevels_; } double get_bw() const {return bw_;} - double get_p() const {return deg_;} + double get_deg() const {return deg_;} double get_xmin() const {return xmin_;} double get_xmax() const {return xmax_;} double get_edf() const {return edf_;} @@ -27,16 +47,25 @@ class LPDens1d { private: // data members - InterpolationGrid1d grid_; - double bw_; + interp::InterpolationGrid1d grid_; + size_t nlevels_; double xmin_; double xmax_; - size_t deg_; - double loglik_; - double edf_; + double bw_{NAN}; + size_t deg_{2}; + double loglik_{NAN}; + double edf_{NAN}; static constexpr double K0_ = 0.3989425; // private methods + Eigen::VectorXd pdf_continuous(const Eigen::VectorXd& x) const; + Eigen::VectorXd cdf_continuous(const Eigen::VectorXd& x) const; + Eigen::VectorXd quantile_continuous(const Eigen::VectorXd& x) const; + Eigen::VectorXd pdf_discrete(const Eigen::VectorXd& x) const; + Eigen::VectorXd cdf_discrete(const Eigen::VectorXd& x) const; + Eigen::VectorXd quantile_discrete(const Eigen::VectorXd& x) const; + + void check_levels(const Eigen::VectorXd& x) const; Eigen::VectorXd kern_gauss(const Eigen::VectorXd& x); Eigen::MatrixXd fit_lp(const Eigen::VectorXd& x_ev, const Eigen::VectorXd& x, @@ -55,44 +84,64 @@ class LPDens1d { const Eigen::VectorXd& weights); Eigen::VectorXd finalize_grid(Eigen::VectorXd& grid_points); Eigen::VectorXd without_boundary_ext(const Eigen::VectorXd& grid_points); + double select_bw(const Eigen::VectorXd& x, + double bw, double mult, size_t deg, size_t nlevels, + const Eigen::VectorXd& weights) const; }; //! constructor for fitting the density estimate. +//! @param nlevels number of factor levels; 0 for continuous variables. //! @param x vector of observations //! @param bw positive bandwidth parameter (fixed component). //! @param xmin lower bound for the support of the density, `NaN` means no //! boundary. //! @param xmax upper bound for the support of the density, `NaN` means no //! boundary. -//! @param p order of the local polynomial. +//! @param deg order of the local polynomial. //! @param weights vector of weights for each observation (can be empty). -inline LPDens1d::LPDens1d(Eigen::VectorXd x, - double bw, - double xmin, - double xmax, - size_t deg, - const Eigen::VectorXd& weights) : - bw_(bw), - xmin_(xmin), - xmax_(xmax), - deg_(deg) +inline Kde1d::Kde1d(const Eigen::VectorXd& x, + size_t nlevels, + double bw, + double mult, + double xmin, + double xmax, + size_t deg, + const Eigen::VectorXd& weights) + : nlevels_(nlevels) + , xmin_(xmin) + , xmax_(xmax) + , bw_(bw) + , deg_(deg) { if (weights.size() > 0 && (weights.size() != x.size())) - throw std::runtime_error("x and weights must have the same size"); + throw std::runtime_error("x and weights must have the same size."); + if (deg > 2) + throw std::runtime_error("deg must not be larger than 2."); + check_levels(x); + if (nlevels_ > 0) { + xmin = NAN; + xmax = NAN; + } - // construct grid on original domain - Eigen::VectorXd grid_points = construct_grid_points(x, weights); + // preprocessing for nans and jittering + Eigen::VectorXd xx = x; + Eigen::VectorXd w = weights; + tools::remove_nans(xx, w); + if (w.size() > 0) + w /= w.sum(); + if (nlevels_ > 0) + xx = stats::equi_jitter(xx); - // transform in case of boundary correction - grid_points = boundary_transform(grid_points); - x = boundary_transform(x); + // bandwidth selection + bw_ = select_bw(boundary_transform(xx), bw_, mult, deg, nlevels_, w); - // fit model and evaluate in transformed domain - Eigen::MatrixXd fitted = fit_lp(grid_points, x, weights); + // construct grid on original domain + Eigen::VectorXd grid_points = construct_grid_points(xx, w); - // back-transform grid to original domain - grid_points = boundary_transform(grid_points, true); - x = boundary_transform(x, true); + // fit model and evaluate in transformed domain + Eigen::MatrixXd fitted = fit_lp(boundary_transform(grid_points), + boundary_transform(xx), + w); // correct estimated density for transformation Eigen::VectorXd values = boundary_correct(grid_points, fitted.col(0)); @@ -102,7 +151,7 @@ inline LPDens1d::LPDens1d(Eigen::VectorXd x, // construct interpolation grid // (3 iterations for normalization to a proper density) - grid_ = InterpolationGrid1d(grid_points, values, 3); + grid_ = interp::InterpolationGrid1d(grid_points, values, 3); // store normalized values values = grid_.get_values(); @@ -112,17 +161,163 @@ inline LPDens1d::LPDens1d(Eigen::VectorXd x, // calculate effective degrees of freedom double n = x.size(); - InterpolationGrid1d infl_grid(without_boundary_ext(grid_points), - without_boundary_ext(fitted.col(1) - .cwiseMin(2.0) - .cwiseMax(0)), - 0); + interp::InterpolationGrid1d infl_grid( + without_boundary_ext(grid_points), + without_boundary_ext(fitted.col(1).cwiseMin(2.0).cwiseMax(0)), 0); edf_ = infl_grid.interpolate(x).sum(); } + +//! construct model from an already fit interpolation grid. +//! @param grid the interpolation grid. +//! @param nlevels number of factor levels; 0 for continuous variables. +//! @param xmin lower bound for the support of the density, `NaN` means no +//! boundary. +//! @param xmax upper bound for the support of the density, `NaN` means no +//! boundary. +inline Kde1d::Kde1d(const interp::InterpolationGrid1d& grid, + size_t nlevels, + double xmin, + double xmax) + : grid_(grid) + , nlevels_(nlevels) + , xmin_(xmin) + , xmax_(xmax) +{} + +//! computes the pdf of the kernel density estimate by interpolation. +//! @param x vector of evaluation points. +//! @return a vector of pdf values. +inline Eigen::VectorXd Kde1d::pdf(const Eigen::VectorXd& x) const +{ + return (nlevels_ == 0) ? pdf_continuous(x) : pdf_discrete(x); +} + +inline Eigen::VectorXd Kde1d::pdf_continuous(const Eigen::VectorXd& x) const +{ + Eigen::VectorXd fhat = grid_.interpolate(x); + if (!std::isnan(xmin_)) { + fhat = (x.array() < xmin_).select(Eigen::VectorXd::Zero(x.size()), fhat); + } + if (!std::isnan(xmax_)) { + fhat = (x.array() > xmax_).select(Eigen::VectorXd::Zero(x.size()), fhat); + } + + auto trunc = [] (const double& xx) { return std::max(xx, 0.0); }; + return tools::unaryExpr_or_nan(fhat, trunc);; +} + +inline Eigen::VectorXd Kde1d::pdf_discrete(const Eigen::VectorXd& x) const +{ + check_levels(x); + auto fhat = pdf_continuous(x); + // normalize + Eigen::VectorXd lvs = Eigen::VectorXd::LinSpaced(nlevels_, 0, nlevels_ - 1); + fhat /= grid_.interpolate(lvs).sum(); + return fhat; +} + +//! computes the cdf of the kernel density estimate by numerical integration. +//! @param x vector of evaluation points. +//! @return a vector of cdf values. +inline Eigen::VectorXd Kde1d::cdf(const Eigen::VectorXd& x) const +{ + return (nlevels_ == 0) ? cdf_continuous(x) : cdf_discrete(x); +} + +inline Eigen::VectorXd Kde1d::cdf_continuous(const Eigen::VectorXd& x) const +{ + auto p = grid_.integrate(x); + auto trunc = [] (const double& xx) { + return std::min(std::max(xx, 0.0), 1.0); + }; + return tools::unaryExpr_or_nan(p, trunc); +} + +inline Eigen::VectorXd Kde1d::cdf_discrete(const Eigen::VectorXd& x) const +{ + check_levels(x); + Eigen::VectorXd lvs = Eigen::VectorXd::LinSpaced(nlevels_, 0, nlevels_ - 1); + auto f_cum = pdf(lvs); + for (size_t i = 1; i < nlevels_; ++i) + f_cum(i) += f_cum(i - 1); + + return tools::unaryExpr_or_nan(x, [&f_cum] (const double& xx) { + return f_cum(static_cast(xx)); + }); +} + +//! computes the cdf of the kernel density estimate by numerical inversion. +//! @param x vector of evaluation points. +//! @return a vector of quantiles. +inline Eigen::VectorXd Kde1d::quantile(const Eigen::VectorXd& x) const +{ + if ((x.minCoeff() < 0) | (x.maxCoeff() > 1)) + throw std::runtime_error("probabilities must lie in (0, 1)."); + return (nlevels_ == 0) ? quantile_continuous(x) : quantile_discrete(x); +} + +inline Eigen::VectorXd Kde1d::quantile_continuous(const Eigen::VectorXd& x) const +{ + auto cdf = [&] (const Eigen::VectorXd& xx) { return grid_.integrate(xx); }; + auto q = tools::invert_f(x, + cdf, + grid_.get_grid_points().minCoeff(), + grid_.get_grid_points().maxCoeff(), + 35); + + // replace with NaN where the input was NaN + for (size_t i = 0; i < x.size(); i++) { + if (std::isnan(x(i))) + q(i) = std::numeric_limits::quiet_NaN(); + } + + return q; +} + +inline Eigen::VectorXd Kde1d::quantile_discrete(const Eigen::VectorXd& x) const +{ + Eigen::VectorXd lvs = Eigen::VectorXd::LinSpaced(nlevels_, 0, nlevels_ - 1); + auto p = cdf(lvs); + auto quan = [&] (const double& pp) { + size_t lv = 0; + while ((pp >= p(lv)) && (lv < nlevels_ - 1)) + lv++; + return lvs(lv); + }; + + return tools::unaryExpr_or_nan(x, quan); +} + +//! simulates data from the model. +//! @param n the number of observations to simulate. +//! @param seeds an optional vector of seeds. +//! @return simulated observations from the kernel density. +inline Eigen::VectorXd Kde1d::simulate(size_t n, + const std::vector& seeds) const +{ + auto u = stats::simulate_uniform(n, seeds); + return this->quantile(u); +} + +inline void Kde1d::check_levels(const Eigen::VectorXd& x) const +{ + if (nlevels_ == 0) + return; + if ((x.array() != x.array().round()).any() | (x.minCoeff() < 0)) { + std::cout << x << std::endl; + throw std::runtime_error("x must only contain non-negatives " + " integers when nlevels > 0."); + } + if (x.maxCoeff() > nlevels_) { + throw std::runtime_error("maximum value of 'x' is larger than the " + "number of factor levels."); + } +} + //! Gaussian kernel (truncated at +/- 5). //! @param x vector of evaluation points. -inline Eigen::VectorXd LPDens1d::kern_gauss(const Eigen::VectorXd& x) +inline Eigen::VectorXd Kde1d::kern_gauss(const Eigen::VectorXd& x) { auto f = [] (double xx) { // truncate at +/- 5 @@ -141,9 +336,9 @@ inline Eigen::VectorXd LPDens1d::kern_gauss(const Eigen::VectorXd& x) //! @param weights vector of weights for each observation (can be empty). //! @return a two-column matrix containing the density estimate in the first //! and the influence function in the second column. -inline Eigen::MatrixXd LPDens1d::fit_lp(const Eigen::VectorXd& x_ev, - const Eigen::VectorXd& x, - const Eigen::VectorXd& weights) +inline Eigen::MatrixXd Kde1d::fit_lp(const Eigen::VectorXd& x_ev, + const Eigen::VectorXd& x, + const Eigen::VectorXd& weights) { Eigen::MatrixXd res(x_ev.size(), 2); size_t n = x.size(); @@ -211,12 +406,12 @@ inline Eigen::MatrixXd LPDens1d::fit_lp(const Eigen::VectorXd& x_ev, //! calculate influence for data point for density estimate based on //! quantities pre-computed in `fit_lp()`. -inline double LPDens1d::calculate_infl(const size_t &n, - const double& f0, - const double& b, - const double& bw, - const double& s, - const double& weight) +inline double Kde1d::calculate_infl(const size_t &n, + const double& f0, + const double& b, + const double& bw, + const double& s, + const double& weight) { Eigen::MatrixXd M; double bw2 = std::pow(bw, 2); @@ -251,14 +446,15 @@ inline double LPDens1d::calculate_infl(const size_t &n, //! @param x evaluation points. //! @param inverse whether the inverse transformation should be applied. //! @return the transformed evaluation points. -inline Eigen::VectorXd LPDens1d::boundary_transform(const Eigen::VectorXd& x, - bool inverse) +inline Eigen::VectorXd Kde1d::boundary_transform(const Eigen::VectorXd& x, + bool inverse) { Eigen::VectorXd x_new = x; if (!inverse) { if (!std::isnan(xmin_) & !std::isnan(xmax_)) { // two boundaries -> probit transform - x_new = (x.array() - xmin_ + 5e-5) / (xmax_ - xmin_ + 1e-4); + auto rng = xmax_ - xmin_; + x_new = (x.array() - xmin_ + 5e-5 * rng) / (xmax_ - xmin_ + 1e-4 * rng); x_new = stats::qnorm(x_new); } else if (!std::isnan(xmin_)) { // left boundary -> log transform @@ -272,8 +468,9 @@ inline Eigen::VectorXd LPDens1d::boundary_transform(const Eigen::VectorXd& x, } else { if (!std::isnan(xmin_) & !std::isnan(xmax_)) { // two boundaries -> probit transform - x_new = stats::pnorm(x).array() + xmin_ - 5e-5; - x_new *= (xmax_ - xmin_ + 1e-4); + auto rng = xmax_ - xmin_; + x_new = stats::pnorm(x).array() + xmin_ - 5e-5 * rng; + x_new *= (xmax_ - xmin_ + 1e-4 * rng); } else if (!std::isnan(xmin_)) { // left boundary -> log transform x_new = x.array().exp() + xmin_ - 1e-3; @@ -293,8 +490,8 @@ inline Eigen::VectorXd LPDens1d::boundary_transform(const Eigen::VectorXd& x, //! @param x evaluation points (in original domain). //! @param fhat the density estimate evaluated in the transformed domain. //! @return corrected density estimates at `x`. -inline Eigen::VectorXd LPDens1d::boundary_correct(const Eigen::VectorXd& x, - const Eigen::VectorXd& fhat) +inline Eigen::VectorXd Kde1d::boundary_correct(const Eigen::VectorXd& x, + const Eigen::VectorXd& fhat) { Eigen::VectorXd corr_term(fhat.size()); if (!std::isnan(xmin_) & !std::isnan(xmax_)) { @@ -320,7 +517,7 @@ inline Eigen::VectorXd LPDens1d::boundary_correct(const Eigen::VectorXd& x, //! constructs a grid that is later used for interpolation. //! @param x vector of observations. //! @return a grid of size 50. -inline Eigen::VectorXd LPDens1d::construct_grid_points( +inline Eigen::VectorXd Kde1d::construct_grid_points( const Eigen::VectorXd& x, const Eigen::VectorXd& weights) { @@ -357,7 +554,7 @@ inline Eigen::VectorXd LPDens1d::construct_grid_points( //! moves the boundary points of the grid to xmin/xmax (if non-NaN). //! @param grid_points the grid points. -inline Eigen::VectorXd LPDens1d::finalize_grid(Eigen::VectorXd& grid_points) +inline Eigen::VectorXd Kde1d::finalize_grid(Eigen::VectorXd& grid_points) { if (!std::isnan(xmin_)) grid_points(0) = xmin_; @@ -370,8 +567,8 @@ inline Eigen::VectorXd LPDens1d::finalize_grid(Eigen::VectorXd& grid_points) //! removes the boundary extension from the grid_points (see //! `construct_grid_points`). //! @param grid_points the grid points. -inline Eigen::VectorXd LPDens1d::without_boundary_ext( - const Eigen::VectorXd& grid_points) +inline Eigen::VectorXd Kde1d::without_boundary_ext( + const Eigen::VectorXd& grid_points) { size_t grid_start = 0; size_t grid_size = grid_points.size(); @@ -386,3 +583,31 @@ inline Eigen::VectorXd LPDens1d::without_boundary_ext( return grid_points.segment(grid_start, grid_size); } +// Bandwidth for Kernel Density Estimation +//' @param x vector of observations +//' @param bw bandwidth parameter, NA for automatic selection. +//' @param mult bandwidth multiplier. +//' @param discrete whether a jittered estimate is computed. +//' @param weights vector of weights for each observation (can be empty). +//' @param deg polynomial degree. +//' @return the selected bandwidth +//' @noRd +inline double Kde1d::select_bw(const Eigen::VectorXd& x, + double bw, double mult, size_t deg, + size_t nlevels, + const Eigen::VectorXd& weights) const +{ + if (std::isnan(bw)) { + bw::PluginBandwidthSelector selector(x, weights); + bw = selector.select_bw(deg); + } + + bw *= mult; + if (nlevels > 0) { + bw = std::max(bw, 0.5 / 5); + } + + return bw; +} + +} // end kde1d diff --git a/inst/include/stats.hpp b/inst/include/kde1d/stats.hpp similarity index 58% rename from inst/include/stats.hpp rename to inst/include/kde1d/stats.hpp index 8846ac1..b3c5ac2 100644 --- a/inst/include/stats.hpp +++ b/inst/include/kde1d/stats.hpp @@ -6,6 +6,9 @@ #include #include #include +#include + +namespace kde1d { //! statistical functions namespace stats { @@ -137,4 +140,93 @@ inline Eigen::VectorXd quantile(const Eigen::VectorXd& x, return res; } +// conditionally equidistant jittering; equivalent to the R implementation: +// tab <- table(x) +// noise <- unname(unlist(lapply(tab, function(l) -0.5 + 1:l / (l + 1)))) +// s <- sort(x, index.return = TRUE) +// return((s$x + noise)[rank(x, ties.method = "first", na.last = "keep")]) +inline Eigen::VectorXd equi_jitter(const Eigen::VectorXd& x) +{ + size_t n = x.size(); + + // first compute the corresponding permutation that sorts x (required later) + Eigen::VectorXi perm(n); + for (size_t i = 0; i < x.size(); ++i) + perm(i) = i; + std::stable_sort( + perm.data(), + perm.data() + n, + [&](const size_t& a, const size_t& b) { return (x[a] < x[b]); } + ); + + // actually sort x + Eigen::VectorXd srt(n); + for (size_t i = 0; i < n; ++i) + srt(i) = x(perm(i)); + + // compute contingency table + Eigen::MatrixXd tab(n + 1, 2); + size_t lev = 0; + size_t cnt = 1; + for (size_t k = 1; k < n; ++k) { + if (srt(k - 1) != srt(k)) { + tab(lev, 0) = srt(k - 1); + tab(lev++, 1) = cnt; + cnt = 1; + } else { + cnt++; + if (k == n - 1) + tab(lev++, 1) = cnt; + } + } + tab.conservativeResize(lev, 2); + + // add deterministic, conditionally uniorm noise + Eigen::VectorXd noise(n); + size_t i = 0; + for (size_t k = 0; k < tab.rows(); ++k) { + for (size_t cnt = 1; cnt <= tab(k, 1); ++cnt) + noise(i++) = -0.5 + cnt / (tab(k, 1) + 1.0); + cnt = 1; + } + Eigen::VectorXd jtr = srt + noise; + + // invert the permutation to return jittered x in original order + for (size_t i = 0; i < perm.size(); ++i) + srt(perm(i)) = jtr(i); + + return srt; +} + +//! @brief simulates from the standard uniform distribution. +//! +//! @param n number of observations. +//! @param seeds seeds of the random number generator; if empty (default), +//! the random number generator is seeded randomly. +//! +//! @return An size n vector of independent \f$ \mathrm{U}[0, 1] \f$ random +//! variables. +inline Eigen::VectorXd simulate_uniform(size_t n, std::vector seeds) +{ + if (n < 1) + throw std::runtime_error("n must be at least 1."); + + if (seeds.size() == 0) { // no seeds provided, seed randomly + std::random_device rd{}; + seeds = std::vector(5); + for (auto& s : seeds) + s = static_cast(rd()); + } + + // initialize random engine and uniform distribution + std::seed_seq seq(seeds.begin(), seeds.end()); + std::mt19937 generator(seq); + std::uniform_real_distribution distribution(0.0, 1.0); + + Eigen::VectorXd U(n); + return U.unaryExpr([&](double) { return distribution(generator); }); } + +} // end kde1d::stats + +} // end kde1d diff --git a/inst/include/tools.hpp b/inst/include/kde1d/tools.hpp similarity index 60% rename from inst/include/tools.hpp rename to inst/include/kde1d/tools.hpp index 0aad6c2..f2696eb 100644 --- a/inst/include/tools.hpp +++ b/inst/include/kde1d/tools.hpp @@ -2,6 +2,8 @@ #include +namespace kde1d { + namespace tools { //! applies a function to each non-NaN value, otherwise returns NaN @@ -56,4 +58,36 @@ inline size_t find_min_index(const Eigen::VectorXd& x) return std::min_element(x.data(), x.data() + x.size()) - x.data(); } +//! remove rows of a matrix which contain nan values or have zero weight +//! @param x the matrix. +//! @param a vector of weights that is either empty or whose size is equal to +//! the number of columns of x. +inline void remove_nans(Eigen::VectorXd& x, Eigen::VectorXd& weights) +{ + if ((weights.size() > 0) & (weights.size() != x.rows())) + throw std::runtime_error("sizes of x and weights don't match."); + + // if an entry is nan or weight is zero, move it to the end + size_t last = x.size() - 1; + for (size_t i = 0; i < last + 1; i++) { + bool is_nan = std::isnan(x(i)); + if (weights.size() > 0) { + is_nan = is_nan | std::isnan(weights(i)); + is_nan = is_nan | (weights(i) == 0.0); + } + if (is_nan) { + if (weights.size() > 0) + std::swap(weights(i), weights(last)); + std::swap(x(i--), x(last--)); + } + } + + // remove nan rows + x.conservativeResize(last + 1); + if (weights.size() > 0) + weights.conservativeResize(last + 1); } + +} // end kde1d tools + +} // end kde1d diff --git a/man/dkde1d.Rd b/man/dkde1d.Rd index 9291dc8..187f7bf 100644 --- a/man/dkde1d.Rd +++ b/man/dkde1d.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/kde1d_methods.R +% Please edit documentation in R/kde1d-methods.R \name{dkde1d} \alias{dkde1d} \alias{pkde1d,} diff --git a/man/plot.kde1d.Rd b/man/plot.kde1d.Rd index 894d4a3..dd84093 100644 --- a/man/plot.kde1d.Rd +++ b/man/plot.kde1d.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/kde1d_methods.R +% Please edit documentation in R/kde1d-methods.R \name{plot.kde1d} \alias{plot.kde1d} \alias{lines.kde1d} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index cccff5b..9c68c5d 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -7,80 +7,65 @@ using namespace Rcpp; // fit_kde1d_cpp -Rcpp::List fit_kde1d_cpp(const Eigen::VectorXd& x, double bw, double xmin, double xmax, size_t deg, const Eigen::VectorXd& weights); -RcppExport SEXP _kde1d_fit_kde1d_cpp(SEXP xSEXP, SEXP bwSEXP, SEXP xminSEXP, SEXP xmaxSEXP, SEXP degSEXP, SEXP weightsSEXP) { +Rcpp::List fit_kde1d_cpp(const Eigen::VectorXd& x, size_t nlevels, double bw, double mult, double xmin, double xmax, size_t deg, const Eigen::VectorXd& weights); +RcppExport SEXP _kde1d_fit_kde1d_cpp(SEXP xSEXP, SEXP nlevelsSEXP, SEXP bwSEXP, SEXP multSEXP, SEXP xminSEXP, SEXP xmaxSEXP, SEXP degSEXP, SEXP weightsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Eigen::VectorXd& >::type x(xSEXP); + Rcpp::traits::input_parameter< size_t >::type nlevels(nlevelsSEXP); Rcpp::traits::input_parameter< double >::type bw(bwSEXP); + Rcpp::traits::input_parameter< double >::type mult(multSEXP); Rcpp::traits::input_parameter< double >::type xmin(xminSEXP); Rcpp::traits::input_parameter< double >::type xmax(xmaxSEXP); Rcpp::traits::input_parameter< size_t >::type deg(degSEXP); Rcpp::traits::input_parameter< const Eigen::VectorXd& >::type weights(weightsSEXP); - rcpp_result_gen = Rcpp::wrap(fit_kde1d_cpp(x, bw, xmin, xmax, deg, weights)); + rcpp_result_gen = Rcpp::wrap(fit_kde1d_cpp(x, nlevels, bw, mult, xmin, xmax, deg, weights)); return rcpp_result_gen; END_RCPP } // dkde1d_cpp -Eigen::VectorXd dkde1d_cpp(const Eigen::VectorXd& x, const Rcpp::List& R_object); -RcppExport SEXP _kde1d_dkde1d_cpp(SEXP xSEXP, SEXP R_objectSEXP) { +Eigen::VectorXd dkde1d_cpp(const Eigen::VectorXd& x, const Rcpp::List& kde1d_r); +RcppExport SEXP _kde1d_dkde1d_cpp(SEXP xSEXP, SEXP kde1d_rSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Eigen::VectorXd& >::type x(xSEXP); - Rcpp::traits::input_parameter< const Rcpp::List& >::type R_object(R_objectSEXP); - rcpp_result_gen = Rcpp::wrap(dkde1d_cpp(x, R_object)); + Rcpp::traits::input_parameter< const Rcpp::List& >::type kde1d_r(kde1d_rSEXP); + rcpp_result_gen = Rcpp::wrap(dkde1d_cpp(x, kde1d_r)); return rcpp_result_gen; END_RCPP } // pkde1d_cpp -Eigen::VectorXd pkde1d_cpp(const Eigen::VectorXd& x, const Rcpp::List& R_object); -RcppExport SEXP _kde1d_pkde1d_cpp(SEXP xSEXP, SEXP R_objectSEXP) { +Eigen::VectorXd pkde1d_cpp(const Eigen::VectorXd& q, const Rcpp::List& kde1d_r); +RcppExport SEXP _kde1d_pkde1d_cpp(SEXP qSEXP, SEXP kde1d_rSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const Eigen::VectorXd& >::type x(xSEXP); - Rcpp::traits::input_parameter< const Rcpp::List& >::type R_object(R_objectSEXP); - rcpp_result_gen = Rcpp::wrap(pkde1d_cpp(x, R_object)); + Rcpp::traits::input_parameter< const Eigen::VectorXd& >::type q(qSEXP); + Rcpp::traits::input_parameter< const Rcpp::List& >::type kde1d_r(kde1d_rSEXP); + rcpp_result_gen = Rcpp::wrap(pkde1d_cpp(q, kde1d_r)); return rcpp_result_gen; END_RCPP } // qkde1d_cpp -Eigen::VectorXd qkde1d_cpp(const Eigen::VectorXd& x, const Rcpp::List& R_object); -RcppExport SEXP _kde1d_qkde1d_cpp(SEXP xSEXP, SEXP R_objectSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const Eigen::VectorXd& >::type x(xSEXP); - Rcpp::traits::input_parameter< const Rcpp::List& >::type R_object(R_objectSEXP); - rcpp_result_gen = Rcpp::wrap(qkde1d_cpp(x, R_object)); - return rcpp_result_gen; -END_RCPP -} -// select_bw_cpp -double select_bw_cpp(const Eigen::VectorXd& x, double bw, double mult, bool discrete, const Eigen::VectorXd& weights, size_t deg); -RcppExport SEXP _kde1d_select_bw_cpp(SEXP xSEXP, SEXP bwSEXP, SEXP multSEXP, SEXP discreteSEXP, SEXP weightsSEXP, SEXP degSEXP) { +Eigen::VectorXd qkde1d_cpp(const Eigen::VectorXd& p, const Rcpp::List& kde1d_r); +RcppExport SEXP _kde1d_qkde1d_cpp(SEXP pSEXP, SEXP kde1d_rSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const Eigen::VectorXd& >::type x(xSEXP); - Rcpp::traits::input_parameter< double >::type bw(bwSEXP); - Rcpp::traits::input_parameter< double >::type mult(multSEXP); - Rcpp::traits::input_parameter< bool >::type discrete(discreteSEXP); - Rcpp::traits::input_parameter< const Eigen::VectorXd& >::type weights(weightsSEXP); - Rcpp::traits::input_parameter< size_t >::type deg(degSEXP); - rcpp_result_gen = Rcpp::wrap(select_bw_cpp(x, bw, mult, discrete, weights, deg)); + Rcpp::traits::input_parameter< const Eigen::VectorXd& >::type p(pSEXP); + Rcpp::traits::input_parameter< const Rcpp::List& >::type kde1d_r(kde1d_rSEXP); + rcpp_result_gen = Rcpp::wrap(qkde1d_cpp(p, kde1d_r)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_kde1d_fit_kde1d_cpp", (DL_FUNC) &_kde1d_fit_kde1d_cpp, 6}, + {"_kde1d_fit_kde1d_cpp", (DL_FUNC) &_kde1d_fit_kde1d_cpp, 8}, {"_kde1d_dkde1d_cpp", (DL_FUNC) &_kde1d_dkde1d_cpp, 2}, {"_kde1d_pkde1d_cpp", (DL_FUNC) &_kde1d_pkde1d_cpp, 2}, {"_kde1d_qkde1d_cpp", (DL_FUNC) &_kde1d_qkde1d_cpp, 2}, - {"_kde1d_select_bw_cpp", (DL_FUNC) &_kde1d_select_bw_cpp, 6}, {NULL, NULL, 0} }; diff --git a/src/kde1d-interface.cpp b/src/kde1d-interface.cpp new file mode 100644 index 0000000..5ca84b8 --- /dev/null +++ b/src/kde1d-interface.cpp @@ -0,0 +1,70 @@ +#include +#include "kde1d-wrappers.hpp" + +using namespace kde1d; + +//' fits a kernel density estimate and calculates the effective degrees of +//' freedom. +//' @param x vector of observations; catergorical data must be converted to +//' non-negative integers. +//' @param nlevels the number of factor levels; 0 for continuous data. +//' @param bw the bandwidth parameter. +//' @param xmin lower bound for the support of the density, `NaN` means no +//' boundary. +//' @param xmax upper bound for the support of the density, `NaN` means no +//' boundary. +//' @param deg order of the local polynomial. +//' @return `An Rcpp::List` containing the fitted density values on a grid and +//' additional information. +//' @noRd +// [[Rcpp::export]] +Rcpp::List fit_kde1d_cpp(const Eigen::VectorXd& x, + size_t nlevels, + double bw, + double mult, + double xmin, + double xmax, + size_t deg, + const Eigen::VectorXd& weights) +{ + Kde1d fit(x, nlevels, bw, mult, xmin, xmax, deg, weights); + return kde1d_wrap(fit); +} + +//' computes the pdf of a kernel density estimate by interpolation. +//' @param x vector of evaluation points. +//' @param kde1d_r the fitted object passed from R. +//' @return a vector of pdf values. +//' @noRd +// [[Rcpp::export]] +Eigen::VectorXd dkde1d_cpp(const Eigen::VectorXd& x, + const Rcpp::List& kde1d_r) +{ + return kde1d_wrap(kde1d_r).pdf(x); +} + +//' computes the cdf of a kernel density estimate by numerical integration. +//' @param x vector of evaluation points. +//' @param kde1d_r the fitted object passed from R. +//' @return a vector of cdf values. +//' @noRd +// [[Rcpp::export]] +Eigen::VectorXd pkde1d_cpp(const Eigen::VectorXd& q, + const Rcpp::List& kde1d_r) +{ + return kde1d_wrap(kde1d_r).cdf(q); +} + +//' computes the quantile of a kernel density estimate by numerical inversion +//' (bisection method). +//' @param x vector of evaluation points. +//' @param kde1d_r the fitted object passed from R. +//' @return a vector of quantiles. +//' @noRd +// [[Rcpp::export]] +Eigen::VectorXd qkde1d_cpp(const Eigen::VectorXd& p, + const Rcpp::List& kde1d_r) +{ + return kde1d_wrap(kde1d_r).quantile(p); +} + diff --git a/src/wrappers.cpp b/src/wrappers.cpp deleted file mode 100644 index a29719f..0000000 --- a/src/wrappers.cpp +++ /dev/null @@ -1,152 +0,0 @@ -#include -#include "dpik.hpp" -#include "lpdens.hpp" - -//' fits a kernel density estimate and calculates the effective degrees of -//' freedom. -//' @param x vector of observations. -//' @param bw the bandwidth parameter. -//' @param xmin lower bound for the support of the density, `NaN` means no -//' boundary. -//' @param xmax upper bound for the support of the density, `NaN` means no -//' boundary. -//' @param deg order of the local polynomial. -//' @return `An Rcpp::List` containing the fitted density values on a grid and -//' additional information. -//' @noRd -// [[Rcpp::export]] -Rcpp::List fit_kde1d_cpp(const Eigen::VectorXd& x, - double bw, - double xmin, - double xmax, - size_t deg, - const Eigen::VectorXd& weights) -{ - LPDens1d fit(x, bw, xmin, xmax, deg, weights); - return Rcpp::List::create( - Rcpp::Named("grid_points") = fit.get_grid_points(), - Rcpp::Named("values") = fit.get_values(), - Rcpp::Named("bw") = bw, - Rcpp::Named("xmin") = xmin, - Rcpp::Named("xmax") = xmax, - Rcpp::Named("deg") = deg, - Rcpp::Named("edf") = fit.get_edf(), - Rcpp::Named("loglik") = fit.get_loglik(), - Rcpp::Named("weights") = weights - ); -} - -// converts a fitted R_object ('kde1d') to an interpolation grid in C++. -// @param R_object the fitted object passed from R. -// @return C++ object of class InterpolationGrid1d. -InterpolationGrid1d wrap_to_cpp(const Rcpp::List& R_object) -{ - Eigen::VectorXd grid_points = R_object["grid_points"]; - Eigen::VectorXd values = R_object["values"]; - // 0 -> already normalized during fit - return InterpolationGrid1d(grid_points, values, 0); -} - -//' computes the pdf of a kernel density estimate by interpolation. -//' @param x vector of evaluation points. -//' @param R_object the fitted object passed from R. -//' @return a vector of pdf values. -//' @noRd -// [[Rcpp::export]] -Eigen::VectorXd dkde1d_cpp(const Eigen::VectorXd& x, - const Rcpp::List& R_object) -{ - Eigen::VectorXd fhat = wrap_to_cpp(R_object).interpolate(x); - double xmin = R_object["xmin"]; - double xmax = R_object["xmax"]; - if (!std::isnan(xmin)) { - fhat = (x.array() < xmin).select(Eigen::VectorXd::Zero(x.size()), fhat); - } - if (!std::isnan(xmax)) { - fhat = (x.array() > xmax).select(Eigen::VectorXd::Zero(x.size()), fhat); - } - - auto trunc = [] (const double& p) { - return std::max(p, 0.0); - }; - - return tools::unaryExpr_or_nan(fhat, trunc); -} - -//' computes the cdf of a kernel density estimate by numerical integration. -//' @param x vector of evaluation points. -//' @param R_object the fitted object passed from R. -//' @return a vector of cdf values. -//' @noRd -// [[Rcpp::export]] -Eigen::VectorXd pkde1d_cpp(const Eigen::VectorXd& x, - const Rcpp::List& R_object) -{ - return wrap_to_cpp(R_object).integrate(x).array().max(0.0).min(1.0); -} - -//' computes the quantile of a kernel density estimate by numerical inversion -//' (bisection method). -//' @param x vector of evaluation points. -//' @param R_object the fitted object passed from R. -//' @return a vector of quantiles. -//' @noRd -// [[Rcpp::export]] -Eigen::VectorXd qkde1d_cpp(const Eigen::VectorXd& x, - const Rcpp::List& R_object) -{ - InterpolationGrid1d fit = wrap_to_cpp(R_object); - auto cdf = [&fit] (const Eigen::VectorXd& xx) { - return fit.integrate(xx); - }; - auto q = tools::invert_f(x, - cdf, - fit.get_grid_points().minCoeff(), - fit.get_grid_points().maxCoeff(), - 35); - - // replace with NaN where the input was NaN - for (size_t i = 0; i < x.size(); i++) { - if (std::isnan(x(i))) - q(i) = std::numeric_limits::quiet_NaN(); - } - - return q; -} - -// Bandwidth for Kernel Density Estimation -//' @param x vector of observations -//' @param bw bandwidth parameter, NA for automatic selection. -//' @param mult bandwidth multiplier. -//' @param discrete whether a jittered estimate is computed. -//' @param weights vector of weights for each observation (can be empty). -//' @param deg polynomial degree. -//' @return the selected bandwidth -//' @noRd -// [[Rcpp::export]] -double select_bw_cpp(const Eigen::VectorXd& x, - double bw, double mult, bool discrete, - const Eigen::VectorXd& weights, size_t deg) -{ - if (std::isnan(bw)) { - PluginBandwidthSelector selector(x, weights); - bw = selector.select_bw(deg); - } - - bw *= mult; - if (discrete) { - bw = std::max(bw, 0.5 / 5); - } - - return bw; -} - - -// // [[Rcpp::export]] -// Eigen::VectorXd quan(const Eigen::VectorXd& x, -// const Eigen::VectorXd& a, -// const Eigen::VectorXd& w) { -// if (w.size() > 0) -// return stats::quantile(x, a, w); -// return stats::quantile(x, a); -// } diff --git a/tests/testthat/test_kde1d.R b/tests/testthat/test_kde1d.R index 63c6f50..27c01b3 100644 --- a/tests/testthat/test_kde1d.R +++ b/tests/testthat/test_kde1d.R @@ -1,39 +1,44 @@ context("Testing 'kde1d'") set.seed(0) -n_sim <- 1e2 +n_sim <- 20 data_types <- c( "unbounded", "left_boundary", "right_boundary", "two_boundaries", "discrete" ) deg <- 0:2 -scenarios <- expand.grid(data_types = data_types, deg = deg) +scenarios <- expand.grid(data_types = data_types, + deg = deg, + stringsAsFactors = FALSE) scenarios <- split(scenarios, seq_len(nrow(scenarios))) -fits <- lapply( - scenarios, - function(scenario) { +fits <- as.list(seq_along(scenarios)) +sims <- as.list(seq_along(scenarios)) + +for (k in seq_along(scenarios)) { + test_that(paste0("can fit ", paste(scenarios[[k]], collapse = "/")), { xmin <- xmax <- NaN - if (scenario$data_type == "unbounded") { + if (scenarios[[k]]$data_type == "unbounded") { x <- rnorm(n_sim) - } else if (scenario$data_type == "left_boundary") { + } else if (scenarios[[k]]$data_type == "left_boundary") { x <- rexp(n_sim) xmin <- 0 - } else if (scenario$data_type == "right_boundary") { + } else if (scenarios[[k]]$data_type == "right_boundary") { x <- -rexp(n_sim) xmax <- 0 - } else if (scenario$data_type == "two_boundaries") { + } else if (scenarios[[k]]$data_type == "two_boundaries") { x <- runif(n_sim) xmin <- 0 xmax <- 1 } else { - x <- ordered(rbinom(n_sim, size = 5, prob = 0.5), - levels = 0:5 - ) + x <- ordered(rbinom(n_sim, size = 5, prob = 0.5), levels = 0:5) } - kde1d(x, xmin = xmin, xmax = xmax, deg = scenario$deg) - } -) + sims[[k]] <- x + expect_silent( + fits[[k]] <<- kde1d(x, xmin = xmin, xmax = xmax, deg = scenarios[[k]]$deg) + ) + }) +} test_that("detects wrong arguments", { x <- rnorm(n_sim) @@ -54,29 +59,28 @@ test_that("returns proper 'kde1d' object", { lapply(fits, function(x) expect_s3_class(x, "kde1d")) class_members <- c( - "grid_points", "values", "bw", "xmin", "xmax", "deg", - "edf", "loglik", "weights", "var_name", "nobs", "x" + "grid_points", "values", "nlevels", "bw", "xmin", "xmax", "deg", + "edf", "loglik", "var_name", "nobs", "weights", "x" ) lapply(fits, function(x) expect_identical(names(x), class_members)) }) -test_that("d/p/r/h functions work", { - n <- 50 - u <- runif(n) - test_dpqr <- function(fit, sim) { - sim <- data.frame(sim) - is_jittered <- is.ordered(fit$x) +u <- runif(20) +for (k in seq_along(scenarios)) { + test_that(paste("d/p/r/h works for", paste(scenarios[[k]], collapse = "/")), { + fit <- fits[[k]] + sim <- rkde1d(20, fit) if (is.nan(fit$xmax)) { - xmax <- ifelse(is_jittered, 5, Inf) + xmax <- ifelse(is.ordered(fit$x), 5, Inf) } else { xmax <- fit$xmax } if (is.nan(fit$xmin)) { - xmin <- ifelse(is_jittered, 0, -Inf) + xmin <- ifelse(is.ordered(fit$x), 0, -Inf) } else { xmin <- fit$xmin } - expect_that(all(sim >= xmin), equals(TRUE)) + expect_that(all(sim >= xmin), equals(TRUE), label = scenarios) expect_that(all(sim <= xmax), equals(TRUE)) expect_gte(max(dkde1d(sim, fit), 0), 0) expect_gte(max(pkde1d(sim, fit), 0), 0) @@ -92,14 +96,9 @@ test_that("d/p/r/h functions work", { expect_equal(dkde1d(xmax + 1, fit), 0) expect_equal(pkde1d(xmax + 1, fit), 1) } - } - - sims <- lapply(fits, function(x) rkde1d(n, x)) - mapply(test_dpqr, fits, sims) - sim <- lapply(fits, function(x) rkde1d(n, x, quasi = TRUE)) - mapply(test_dpqr, fits, sims) -}) + }) +} test_that("plot functions work", { test_plot <- function(fit) { @@ -134,8 +133,7 @@ test_that("behavior for discrete data is consistent", { xx <- ordered(1:5, 1:5) expect_equal(dkde1d(1:5, fit), dkde1d(xx, fit)) expect_equal(pkde1d(1:5, fit), pkde1d(xx, fit)) - expect_true(all(is.na(dkde1d(c(0, 6), fit)))) - expect_true(all(is.na(pkde1d(c(0, 6), fit)))) + expect_error(all(is.na(dkde1d(c(0, 6), fit)))) expect_true(all(rkde1d(n, fit) %in% x)) })