From 03d158fc5dfaa30ded5fc62bdd47237cbba1e3a6 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Wed, 2 Oct 2024 15:02:12 +0100 Subject: [PATCH 01/20] initial implementation of basic summary stats matching hierfstat --- R/hierfstat_basic_stats.R | 103 ++++++++++++++++++++++++++ R/pop_basic_stats.R | 103 ++++++++++++++++++++++++++ R/utils.R | 3 + tests/testthat/test_pop_basic_stats.R | 33 +++++++++ 4 files changed, 242 insertions(+) create mode 100644 R/hierfstat_basic_stats.R create mode 100644 R/pop_basic_stats.R create mode 100644 tests/testthat/test_pop_basic_stats.R diff --git a/R/hierfstat_basic_stats.R b/R/hierfstat_basic_stats.R new file mode 100644 index 00000000..8d472413 --- /dev/null +++ b/R/hierfstat_basic_stats.R @@ -0,0 +1,103 @@ +#' Compute basic population statistics +#' +#' This function computes basic population statistics, following the formulas used in `hietfstat`. +#' +#' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()]; use on +#' a single population will return a number of quantities as NA/NaN) +#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()] +#' @returns a tibble of population statistics, with populations as rows and statistics as columns +#' @export + + +# adapted from hierfstat +hierfstat_basic_stats <- function(.x, n_cores = bigstatsr::nb_cores()){ + stopifnot_diploid(.x) + # get the populations if it is a grouped gen_tibble + if (!inherits(.x,"grouped_df")){ + .group_levels <- .x %>% group_keys() + .group_ids <- dplyr::group_indices(.x)-1 + } else { # create a dummy pop + .group_levels = "pop" + .group_ids <- rep(0,nrow(.x)) + } + + + # summarise population frequencies + pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, + rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = .group_ids, + ngroups = nrow(.group_levels), + ncores = n_cores) + # get the number of individuals + n <-pop_freqs_df$n/2 + # is this correct? + sHo <-pop_freqs_df$het_obs + mHo <- apply(sHo, 1, mean, na.rm = TRUE) + + # sum of squared frequencies + sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2 + Hs <- (1 - sp2 - sHo/2/n) + Hs <- n/(n - 1) * Hs + Fis = 1 - sHo/Hs + + np <- apply(n, 1, fun <- function(x) sum(!is.na(x))) + # mean sample size over the populations + mn <- apply(n, 1, fun <- function(x) { + sum(!is.na(x))/sum(1/x[!is.na(x)]) + }) + # mean sum of square frequencies + msp2 <- apply(sp2, 1, mean, na.rm = TRUE) + mp2 <- rowMeans(pop_freqs_df$freq_alt)^2+rowMeans(pop_freqs_df$freq_ref)^2 + mHs <- mn/(mn - 1) * (1 - msp2 - mHo/2/mn) + Ht <- 1 - mp2 + mHs/mn/np - mHo/2/mn/np + mFis = 1 - mHo/mHs + + Dst <- Ht - mHs + Dstp <- np/(np - 1) * Dst + Htp = mHs + Dstp + Fst = Dst/Ht + Fstp = Dstp/Htp + Dest <- Dstp/(1 - mHs) + res <- data.frame(cbind(mHo, mHs, Ht, Dst, Htp, Dstp, Fst, + Fstp, mFis, Dest)) + names(res) <- c("Ho", "Hs", "Ht", "Dst", "Htp", "Dstp", + "Fst", "Fstp", "Fis", "Dest") + # overall summaries + is.na(res) <- do.call(cbind, lapply(res, is.infinite)) + overall <- apply(res, 2, mean, na.rm = TRUE) + overall[7] <- overall[4]/overall[3] + overall[8] <- overall[6]/overall[5] + overall[9] <- 1 - overall[1]/overall[2] + overall[10] <- overall[6]/(1 - overall[2]) + names(overall) <- names(res) + + digits <- 4 + all.res <- list(n.ind.samp = n, Ho = round(sHo, digits), Hs = round(Hs, digits), + Fis = round(Fis, digits), perloc = round(res, digits), + overall = round(overall, digits)) + class(all.res) <- "basic.stats" + all.res + + + +} + +# loci_het_obs +# loci_het_exp +# loci_gene_div +# loci_dst +# loci_dst_prime +# loci_het_tot +# loci_het_top_prime +# loci_fst +# loci_fst_prime +# loci_fis +# loci_jost_d + +#pop_Ho +#pop_Hs +#pop_Fis + + + diff --git a/R/pop_basic_stats.R b/R/pop_basic_stats.R new file mode 100644 index 00000000..8d472413 --- /dev/null +++ b/R/pop_basic_stats.R @@ -0,0 +1,103 @@ +#' Compute basic population statistics +#' +#' This function computes basic population statistics, following the formulas used in `hietfstat`. +#' +#' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()]; use on +#' a single population will return a number of quantities as NA/NaN) +#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()] +#' @returns a tibble of population statistics, with populations as rows and statistics as columns +#' @export + + +# adapted from hierfstat +hierfstat_basic_stats <- function(.x, n_cores = bigstatsr::nb_cores()){ + stopifnot_diploid(.x) + # get the populations if it is a grouped gen_tibble + if (!inherits(.x,"grouped_df")){ + .group_levels <- .x %>% group_keys() + .group_ids <- dplyr::group_indices(.x)-1 + } else { # create a dummy pop + .group_levels = "pop" + .group_ids <- rep(0,nrow(.x)) + } + + + # summarise population frequencies + pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, + rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = .group_ids, + ngroups = nrow(.group_levels), + ncores = n_cores) + # get the number of individuals + n <-pop_freqs_df$n/2 + # is this correct? + sHo <-pop_freqs_df$het_obs + mHo <- apply(sHo, 1, mean, na.rm = TRUE) + + # sum of squared frequencies + sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2 + Hs <- (1 - sp2 - sHo/2/n) + Hs <- n/(n - 1) * Hs + Fis = 1 - sHo/Hs + + np <- apply(n, 1, fun <- function(x) sum(!is.na(x))) + # mean sample size over the populations + mn <- apply(n, 1, fun <- function(x) { + sum(!is.na(x))/sum(1/x[!is.na(x)]) + }) + # mean sum of square frequencies + msp2 <- apply(sp2, 1, mean, na.rm = TRUE) + mp2 <- rowMeans(pop_freqs_df$freq_alt)^2+rowMeans(pop_freqs_df$freq_ref)^2 + mHs <- mn/(mn - 1) * (1 - msp2 - mHo/2/mn) + Ht <- 1 - mp2 + mHs/mn/np - mHo/2/mn/np + mFis = 1 - mHo/mHs + + Dst <- Ht - mHs + Dstp <- np/(np - 1) * Dst + Htp = mHs + Dstp + Fst = Dst/Ht + Fstp = Dstp/Htp + Dest <- Dstp/(1 - mHs) + res <- data.frame(cbind(mHo, mHs, Ht, Dst, Htp, Dstp, Fst, + Fstp, mFis, Dest)) + names(res) <- c("Ho", "Hs", "Ht", "Dst", "Htp", "Dstp", + "Fst", "Fstp", "Fis", "Dest") + # overall summaries + is.na(res) <- do.call(cbind, lapply(res, is.infinite)) + overall <- apply(res, 2, mean, na.rm = TRUE) + overall[7] <- overall[4]/overall[3] + overall[8] <- overall[6]/overall[5] + overall[9] <- 1 - overall[1]/overall[2] + overall[10] <- overall[6]/(1 - overall[2]) + names(overall) <- names(res) + + digits <- 4 + all.res <- list(n.ind.samp = n, Ho = round(sHo, digits), Hs = round(Hs, digits), + Fis = round(Fis, digits), perloc = round(res, digits), + overall = round(overall, digits)) + class(all.res) <- "basic.stats" + all.res + + + +} + +# loci_het_obs +# loci_het_exp +# loci_gene_div +# loci_dst +# loci_dst_prime +# loci_het_tot +# loci_het_top_prime +# loci_fst +# loci_fst_prime +# loci_fis +# loci_jost_d + +#pop_Ho +#pop_Hs +#pop_Fis + + + diff --git a/R/utils.R b/R/utils.R index ed5e5f70..20a6b2da 100644 --- a/R/utils.R +++ b/R/utils.R @@ -24,6 +24,9 @@ tidy_dist_matrix<- function(mat){ # stop if not diploid stopifnot_diploid <- function(x){ + if (inherits(x, "gen_tbl")){ + x <- x$genotypes + } if (attr(x,"ploidy")!=2){ stop("this function only works on diploid data") } diff --git a/tests/testthat/test_pop_basic_stats.R b/tests/testthat/test_pop_basic_stats.R new file mode 100644 index 00000000..b8ce571d --- /dev/null +++ b/tests/testthat/test_pop_basic_stats.R @@ -0,0 +1,33 @@ +test_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,NA,0,0), + c(2,NA,0,0,1,1), + c(1,0,0,1,0,0), + c(1,2,0,1,2,1), + c(0,0,0,0,NA,1), + c(0,1,1,0,1,NA)) +test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"), + population = c("pop1","pop1","pop2","pop2","pop1","pop3","pop3")) +test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=paste0("chr",c(1,1,1,1,2,2)), + position=as.integer(c(3,5,65,343,23,456)), + genetic_dist = as.integer(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) + +test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) + + +test_that("pairwise_pop_fst compute correctly",{ + test_gt <- test_gt %>% dplyr::group_by(population) + test_hier <- gt_as_hierfstat(test_gt) + # compare results against hierfstat for Nei87 (Nei86 does not correct for Ho + # when computing Ht, so it gives a different result) + basic_gt <- test_gt %>% hierfstat_basic_stats() + + basic_hier <- hierfstat::basic.stats(test_hier) + # hiefstat values are rounded to 4 dp + expect_true(all.equal(basic_hier$perloc, round(basic_gt,4),check.attributes=FALSE)) + + #pair_fst_locus <- test_gt %>% pairwise_pop_fst(by_locus = TRUE) + +}) From 1a76d69a10ba126049ebbaf805e0448a26c02c2d Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Wed, 2 Oct 2024 15:12:31 +0100 Subject: [PATCH 02/20] fix ungrouped case --- R/hierfstat_basic_stats.R | 6 +-- R/pop_basic_stats.R | 103 -------------------------------------- 2 files changed, 3 insertions(+), 106 deletions(-) delete mode 100644 R/pop_basic_stats.R diff --git a/R/hierfstat_basic_stats.R b/R/hierfstat_basic_stats.R index 8d472413..148153e6 100644 --- a/R/hierfstat_basic_stats.R +++ b/R/hierfstat_basic_stats.R @@ -13,15 +13,14 @@ hierfstat_basic_stats <- function(.x, n_cores = bigstatsr::nb_cores()){ stopifnot_diploid(.x) # get the populations if it is a grouped gen_tibble - if (!inherits(.x,"grouped_df")){ + if (inherits(.x,"grouped_df")){ .group_levels <- .x %>% group_keys() .group_ids <- dplyr::group_indices(.x)-1 } else { # create a dummy pop - .group_levels = "pop" + .group_levels = tibble(population="pop") .group_ids <- rep(0,nrow(.x)) } - # summarise population frequencies pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, rowInd = .gt_bigsnp_rows(.x), @@ -29,6 +28,7 @@ hierfstat_basic_stats <- function(.x, n_cores = bigstatsr::nb_cores()){ groupIds = .group_ids, ngroups = nrow(.group_levels), ncores = n_cores) + # get the number of individuals n <-pop_freqs_df$n/2 # is this correct? diff --git a/R/pop_basic_stats.R b/R/pop_basic_stats.R deleted file mode 100644 index 8d472413..00000000 --- a/R/pop_basic_stats.R +++ /dev/null @@ -1,103 +0,0 @@ -#' Compute basic population statistics -#' -#' This function computes basic population statistics, following the formulas used in `hietfstat`. -#' -#' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()]; use on -#' a single population will return a number of quantities as NA/NaN) -#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()] -#' @returns a tibble of population statistics, with populations as rows and statistics as columns -#' @export - - -# adapted from hierfstat -hierfstat_basic_stats <- function(.x, n_cores = bigstatsr::nb_cores()){ - stopifnot_diploid(.x) - # get the populations if it is a grouped gen_tibble - if (!inherits(.x,"grouped_df")){ - .group_levels <- .x %>% group_keys() - .group_ids <- dplyr::group_indices(.x)-1 - } else { # create a dummy pop - .group_levels = "pop" - .group_ids <- rep(0,nrow(.x)) - } - - - # summarise population frequencies - pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, - rowInd = .gt_bigsnp_rows(.x), - colInd = .gt_bigsnp_cols(.x), - groupIds = .group_ids, - ngroups = nrow(.group_levels), - ncores = n_cores) - # get the number of individuals - n <-pop_freqs_df$n/2 - # is this correct? - sHo <-pop_freqs_df$het_obs - mHo <- apply(sHo, 1, mean, na.rm = TRUE) - - # sum of squared frequencies - sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2 - Hs <- (1 - sp2 - sHo/2/n) - Hs <- n/(n - 1) * Hs - Fis = 1 - sHo/Hs - - np <- apply(n, 1, fun <- function(x) sum(!is.na(x))) - # mean sample size over the populations - mn <- apply(n, 1, fun <- function(x) { - sum(!is.na(x))/sum(1/x[!is.na(x)]) - }) - # mean sum of square frequencies - msp2 <- apply(sp2, 1, mean, na.rm = TRUE) - mp2 <- rowMeans(pop_freqs_df$freq_alt)^2+rowMeans(pop_freqs_df$freq_ref)^2 - mHs <- mn/(mn - 1) * (1 - msp2 - mHo/2/mn) - Ht <- 1 - mp2 + mHs/mn/np - mHo/2/mn/np - mFis = 1 - mHo/mHs - - Dst <- Ht - mHs - Dstp <- np/(np - 1) * Dst - Htp = mHs + Dstp - Fst = Dst/Ht - Fstp = Dstp/Htp - Dest <- Dstp/(1 - mHs) - res <- data.frame(cbind(mHo, mHs, Ht, Dst, Htp, Dstp, Fst, - Fstp, mFis, Dest)) - names(res) <- c("Ho", "Hs", "Ht", "Dst", "Htp", "Dstp", - "Fst", "Fstp", "Fis", "Dest") - # overall summaries - is.na(res) <- do.call(cbind, lapply(res, is.infinite)) - overall <- apply(res, 2, mean, na.rm = TRUE) - overall[7] <- overall[4]/overall[3] - overall[8] <- overall[6]/overall[5] - overall[9] <- 1 - overall[1]/overall[2] - overall[10] <- overall[6]/(1 - overall[2]) - names(overall) <- names(res) - - digits <- 4 - all.res <- list(n.ind.samp = n, Ho = round(sHo, digits), Hs = round(Hs, digits), - Fis = round(Fis, digits), perloc = round(res, digits), - overall = round(overall, digits)) - class(all.res) <- "basic.stats" - all.res - - - -} - -# loci_het_obs -# loci_het_exp -# loci_gene_div -# loci_dst -# loci_dst_prime -# loci_het_tot -# loci_het_top_prime -# loci_fst -# loci_fst_prime -# loci_fis -# loci_jost_d - -#pop_Ho -#pop_Hs -#pop_Fis - - - From e1ababcbcd5e5c45953afad5b8b1808dbdb354be Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Wed, 2 Oct 2024 15:18:36 +0100 Subject: [PATCH 03/20] small test --- tests/testthat/test_pop_basic_stats.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_pop_basic_stats.R b/tests/testthat/test_pop_basic_stats.R index b8ce571d..7e1f8bbb 100644 --- a/tests/testthat/test_pop_basic_stats.R +++ b/tests/testthat/test_pop_basic_stats.R @@ -26,7 +26,7 @@ test_that("pairwise_pop_fst compute correctly",{ basic_hier <- hierfstat::basic.stats(test_hier) # hiefstat values are rounded to 4 dp - expect_true(all.equal(basic_hier$perloc, round(basic_gt,4),check.attributes=FALSE)) + expect_true(all.equal(basic_hier$perloc, basic_gt$perloc, check.attributes=FALSE)) #pair_fst_locus <- test_gt %>% pairwise_pop_fst(by_locus = TRUE) From 19c14dec3f4c4bc3a4155b787d5a305db0715c61 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Thu, 10 Oct 2024 09:19:38 +0100 Subject: [PATCH 04/20] various descriptive pop statistics --- .Rbuildignore | 1 + NAMESPACE | 8 +- R/indiv_het_obs.R | 10 +-- R/indiv_missingness.R | 12 +-- R/indiv_ploidy.R | 12 +-- R/pop_fis.R | 80 +++++++++++++++++-- R/pop_fst.R | 4 +- R/pop_global_stats.R | 108 ++++++++++++++++++++++++++ R/pop_het_exp.R | 74 ++++++++++++++++++ R/pop_het_obs.R | 58 ++++++++++++++ man/hierfstat_basic_stats.Rd | 20 +++++ man/indiv_het_obs.Rd | 3 - man/indiv_missingness.Rd | 3 - man/indiv_ploidy.Rd | 3 - man/pop_fis.Rd | 28 +++++-- man/pop_fst.Rd | 6 +- man/pop_global_stats.Rd | 49 ++++++++++++ man/pop_het_exp.Rd | 48 ++++++++++++ man/pop_het_obs.Rd | 43 ++++++++++ tests/testthat/test_gen_tibble.R | 4 +- tests/testthat/test_pop_basic_stats.R | 59 +++++++++++--- tests/testthat/test_pop_fis_fst.R | 6 +- 22 files changed, 584 insertions(+), 55 deletions(-) create mode 100644 R/pop_global_stats.R create mode 100644 R/pop_het_exp.R create mode 100644 R/pop_het_obs.R create mode 100644 man/hierfstat_basic_stats.Rd create mode 100644 man/pop_global_stats.Rd create mode 100644 man/pop_het_exp.Rd create mode 100644 man/pop_het_obs.Rd diff --git a/.Rbuildignore b/.Rbuildignore index 36411269..121042f3 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,3 +4,4 @@ ^LICENSE\.md$ ^\.github$ pkgdown/ +_pkgdown.yml diff --git a/NAMESPACE b/NAMESPACE index f4bce78a..867e725b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,13 +24,10 @@ S3method(dplyr_row_slice,grouped_gen_tbl) S3method(gen_tibble,character) S3method(gen_tibble,matrix) S3method(group_by,gen_tbl) -S3method(indiv_het_obs,grouped_df) S3method(indiv_het_obs,tbl_df) S3method(indiv_het_obs,vctrs_bigSNP) -S3method(indiv_missingness,grouped_df) S3method(indiv_missingness,tbl_df) S3method(indiv_missingness,vctrs_bigSNP) -S3method(indiv_ploidy,grouped_df) S3method(indiv_ploidy,tbl_df) S3method(indiv_ploidy,vctrs_bigSNP) S3method(loci_alt_freq,grouped_df) @@ -106,6 +103,7 @@ export(gt_roh_window) export(gt_save) export(gt_set_imputed) export(gt_uses_imputed) +export(hierfstat_basic_stats) export(indiv_het_obs) export(indiv_missingness) export(indiv_ploidy) @@ -124,6 +122,10 @@ export(pairwise_king) export(pairwise_pop_fst) export(pop_fis) export(pop_fst) +export(pop_gene_div) +export(pop_global_stats) +export(pop_het_exp) +export(pop_het_obs) export(qc_report_indiv) export(qc_report_loci) export(rbind_dry_run) diff --git a/R/indiv_het_obs.R b/R/indiv_het_obs.R index f8a11932..ac4522e0 100644 --- a/R/indiv_het_obs.R +++ b/R/indiv_het_obs.R @@ -48,8 +48,8 @@ indiv_het_obs.vctrs_bigSNP <- function(.x, ...){ this_col_1_na[1,]/(ncol(X)-this_col_1_na[2,]) } -#' @export -#' @rdname indiv_het_obs -indiv_het_obs.grouped_df <- function(.x, ...){ - .x %>% mutate(indiv_het_obs = indiv_het_obs(.data$genotypes)) %>% summarise(het_obs = mean(indiv_het_obs)) -} +# #' @export +# #' @rdname indiv_het_obs +# indiv_het_obs.grouped_df <- function(.x, ...){ +# .x %>% mutate(indiv_het_obs = indiv_het_obs(.data$genotypes)) %>% summarise(het_obs = mean(indiv_het_obs)) +# } diff --git a/R/indiv_missingness.R b/R/indiv_missingness.R index 91ec948b..a3c6e959 100644 --- a/R/indiv_missingness.R +++ b/R/indiv_missingness.R @@ -42,9 +42,9 @@ indiv_missingness.vctrs_bigSNP <- function(.x, as_counts = FALSE, ...){ row_na } -#' @export -#' @rdname indiv_missingness -indiv_missingness.grouped_df <- function(.x, as_counts = FALSE, ...){ - .x %>% mutate(indiv_missingness = indiv_missingness(.data$genotypes, as_counts = as_counts)) %>% - summarise(het_obs = mean(indiv_missingness)) -} +#' #' @export +#' #' @rdname indiv_missingness +#' indiv_missingness.grouped_df <- function(.x, as_counts = FALSE, ...){ +#' .x %>% mutate(indiv_missingness = indiv_missingness(.data$genotypes, as_counts = as_counts)) %>% +#' summarise(het_obs = mean(indiv_missingness)) +#' } diff --git a/R/indiv_ploidy.R b/R/indiv_ploidy.R index ccedd88c..fc92fab7 100644 --- a/R/indiv_ploidy.R +++ b/R/indiv_ploidy.R @@ -31,9 +31,9 @@ indiv_ploidy.vctrs_bigSNP <- function(.x, ...){ } } -#' @export -#' @rdname indiv_ploidy -indiv_ploidy.grouped_df <- function(.x, ...){ - .x %>% mutate(indiv_ploidy = indiv_ploidy(.data$genotypes)) %>% - summarise(ploidy = mean(indiv_ploidy)) -} +#' #' @export +#' #' @rdname indiv_ploidy +#' indiv_ploidy.grouped_df <- function(.x, ...){ +#' .x %>% mutate(indiv_ploidy = indiv_ploidy(.data$genotypes)) %>% +#' summarise(ploidy = mean(indiv_ploidy)) +#' } diff --git a/R/pop_fis.R b/R/pop_fis.R index 268dcc9a..109a1b55 100644 --- a/R/pop_fis.R +++ b/R/pop_fis.R @@ -1,17 +1,87 @@ #' Compute population specific FIS #' -#' This function computes population specific FIS (as computed by [hierfstat::fis.dosage()]). +#' This function computes population specific FIS, using either the approach of Nei 1987 ((as computed by [hierfstat::basic.stats()]).) or of Weir and Goudet 2017 +#' (as computed by [hierfstat::fis.dosage()]). +#' @references +#' Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press +#' Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085 #' @param .x a grouped [`gen_tibble`] (as obtained by using [dplyr::group_by()]) -#' @param include_global boolean determining whether, besides the population specific fis, a global -#' fis should be appended. Note that this will return a vector of n populations plus 1 (the global value) -#' @param allele_sharing_mat optional, the matrix of Allele Sharing returned by +#' @param method one of "Nei87" (based on Nei 1987) or "WG17" (for Weir and Goudet 2017) to compute FIS +#' @param by_locus boolean, determining whether FIS should be returned by locus(TRUE), +#' or as a single genome wide value (FALSE, the default). Note that this is only relevant for "Nei87", +#' as "WG17" always returns a single value. +#' @param include_global boolean determining whether, besides the population specific estiamtes, a global +#' estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value), +#' or a matrix with n+1 columns if `by_locus=TRUE`. +#' @param allele_sharing_mat optional and only relevant for "WG17", the matrix of Allele Sharing returned by #' [pairwise_allele_sharing()] with `as_matrix=TRUE`. As a number of statistics can be #' derived from the Allele Sharing matrix, #' it it sometimes more efficient to pre-compute this matrix. #' @returns a vector of population specific fis (plus the global value if `include_global=TRUE`) #' @export -pop_fis <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){ +pop_fis <- function(.x, method = c("Nei87", "WG17"), by_locus = FALSE, include_global=FALSE, allele_sharing_mat = NULL){ + method <- match.arg(method) + if (method == "Nei87"){ + if (!is.null(allele_sharing_mat)){ + stop("allele_sharing_mat not relevant for Nei87") + } + pop_fis_nei87(.x, by_locus = by_locus, include_global = include_global) + } else if (method == "WG17"){ + if (by_locus){ + stop("by_locus not implemented for WG17") + } + pop_fis_wg17(.x, include_global=include_global, allele_sharing_mat = allele_sharing_mat) + } +} + +pop_fis_nei87 <- function(.x, by_locus = FALSE, include_global=include_global, + n_cores = bigstatsr::nb_cores()){ + stopifnot_diploid(.x) + # get the populations if it is a grouped gen_tibble + if (inherits(.x,"grouped_df")){ + .group_levels <- .x %>% group_keys() + .group_ids <- dplyr::group_indices(.x)-1 + } else { # create a dummy pop + .group_levels = tibble(population="pop") + .group_ids <- rep(0,nrow(.x)) + } + + # summarise population frequencies + pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, + rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = .group_ids, + ngroups = nrow(.group_levels), + ncores = n_cores) + sHo <-pop_freqs_df$het_obs + mHo <- apply(sHo, 1, mean, na.rm = TRUE) + n <-pop_freqs_df$n/2 + # sum of squared frequencies + sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2 + Hs <- (1 - sp2 - sHo/2/n) + Hs <- n/(n - 1) * Hs + Fis = 1 - sHo/Hs + + colnames(Fis)<- .group_levels %>% dplyr::pull(1) + if (by_locus){ + if (include_global){ + global <- (.x %>% pop_global_stats(by_locus = TRUE, n_cores = n_cores))$Fis + Fis <- cbind(Fis, global) + } + } else { + Fis <- colMeans(Fis, na.rm = TRUE) + if (include_global){ + global <- (.x %>% pop_global_stats(by_locus = FALSE, n_cores = n_cores))["Fis"] + names(global) <- "global" + Fis <- c(Fis, global) + } + } + return(Fis) +} + + +pop_fis_wg17 <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){ if (!inherits(.x,"grouped_df")){ stop (".x should be a grouped df") } diff --git a/R/pop_fst.R b/R/pop_fst.R index 99e84280..61e9c844 100644 --- a/R/pop_fst.R +++ b/R/pop_fst.R @@ -1,6 +1,8 @@ #' Compute population specific Fst #' -#' This function computes population specific Fst (as computed by [hierfstat::fst.dosage()]). +#' This function computes population specific Fst, using the approach in Weir and Goudet 2017 +#' (as computed by [hierfstat::fst.dosage()]). +#' @references Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085 #' @param .x a grouped [`gen_tibble`] (as obtained by using [dplyr::group_by()]) #' @param include_global boolean determining whether, besides the population specific Fst, a global #' Fst should be appended. Note that this will return a vector of n populations plus 1 (the global value) diff --git a/R/pop_global_stats.R b/R/pop_global_stats.R new file mode 100644 index 00000000..dcffd63f --- /dev/null +++ b/R/pop_global_stats.R @@ -0,0 +1,108 @@ +#' Compute basic population global statistics +#' +#' This function computes basic population global statistics, following the formulas used in `hietfstat::basic.stats`. +#' +#' @details TO BE CORRECTED (THIS IS INCOMPLETE, AND FORMULAE WERE FILLED IN BY COPILOT) +#' The statistics computed are: +#' - observed heterozygosity (`Ho`) +#' - expected heterozygosity (`Hs`) +#' - total heterozygosity (`Ht`) +#' - within population diversity (`Dst`) +#' - total population diversity (`Htp`) +#' - total population diversity (`Dstp`) +#' - Fst +#' - Fst prime +#' - Fis +#' - Dest +#' @details The formulas used are: +#' \eqn{Ho= 1-\sum_k \sum_i P_{k,ii}/n_p} +#' where \eqn{P_{k,ii}} represents the proportion of homozygote \eqn{i} in sample \eqn{k} and +#' \eqn{n_{p}} the number of samples. +#' \eqn{Hs=\tilde{n}/(\tilde{n}-1)[1-\sum_i\bar{p_i^2}-Ho/2\tilde{n}]} +#' where \eqn{\tilde{n}=np/\sum_k 1/n_k} and +#' \eqn{\bar{p_i^2}=\sum_k p_{ki}^2/np} +#' \eqn{Ht=1-\sum_k \bar{p_k^2} + \bar{Hs}/\tilde{n}/np - Ho/2/\tilde{n}/np} +#' \eqn{Dst=Ht - Hs} +#' +#' +#' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()]; use on +#' a single population will return a number of quantities as NA/NaN) +#' @param by_locus boolean, determining whether the statistics should be returned by locus(TRUE), +#' or as a single genome wide value (FALSE, the default). +#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()] +#' @returns a tibble of population statistics, with populations as rows and statistics as columns +#' @export + + +# this code is adapted from hierfstat::basic.stats by Jerome Goudet +pop_global_stats <- function(.x, by_locus = FALSE, n_cores = bigstatsr::nb_cores()){ + stopifnot_diploid(.x) + # get the populations if it is a grouped gen_tibble + if (inherits(.x,"grouped_df")){ + .group_levels <- .x %>% group_keys() + .group_ids <- dplyr::group_indices(.x)-1 + } else { # create a dummy pop + .group_levels = tibble(population="pop") + .group_ids <- rep(0,nrow(.x)) + } + + # summarise population frequencies + pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, + rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = .group_ids, + ngroups = nrow(.group_levels), + ncores = n_cores) + + # get the number of individuals + n <-pop_freqs_df$n/2 + # is this correct? + sHo <-pop_freqs_df$het_obs + mHo <- apply(sHo, 1, mean, na.rm = TRUE) + + # sum of squared frequencies + sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2 +# Hs <- (1 - sp2 - sHo/2/n) +# Hs <- n/(n - 1) * Hs +# Fis = 1 - sHo/Hs + + np <- apply(n, 1, fun <- function(x) sum(!is.na(x))) + # mean sample size over the populations + mn <- apply(n, 1, fun <- function(x) { + sum(!is.na(x))/sum(1/x[!is.na(x)]) + }) + # mean sum of square frequencies + msp2 <- apply(sp2, 1, mean, na.rm = TRUE) + mp2 <- rowMeans(pop_freqs_df$freq_alt)^2+rowMeans(pop_freqs_df$freq_ref)^2 + mHs <- mn/(mn - 1) * (1 - msp2 - mHo/2/mn) + Ht <- 1 - mp2 + mHs/mn/np - mHo/2/mn/np + mFis = 1 - mHo/mHs + + Dst <- Ht - mHs + Dstp <- np/(np - 1) * Dst + Htp = mHs + Dstp + Fst = Dst/Ht + Fstp = Dstp/Htp + Dest <- Dstp/(1 - mHs) + res <- data.frame(cbind(mHo, mHs, Ht, Dst, Htp, Dstp, Fst, + Fstp, mFis, Dest)) + names(res) <- c("Ho", "Hs", "Ht", "Dst", "Htp", "Dstp", + "Fst", "Fstp", "Fis", "Dest") + + if (by_locus){ + return(res) + } else { + # overall summaries + is.na(res) <- do.call(cbind, lapply(res, is.infinite)) + overall <- apply(res, 2, mean, na.rm = TRUE) + overall[7] <- overall[4]/overall[3] + overall[8] <- overall[6]/overall[5] + overall[9] <- 1 - overall[1]/overall[2] + overall[10] <- overall[6]/(1 - overall[2]) + names(overall) <- names(res) + return(overall) + } +} + + + diff --git a/R/pop_het_exp.R b/R/pop_het_exp.R new file mode 100644 index 00000000..d434cc23 --- /dev/null +++ b/R/pop_het_exp.R @@ -0,0 +1,74 @@ +#' Compute the population expected heterozygosity +#' +#' This function computes expected population heterozygosity (also +#' referred to as gene diversity, to avoid the potentially misleading use of the term "expected" in this context), using the formula of Nei (1987). +#' +#' @details Within population expected heterozygosity (gene diversity) is defined as: +#' \eqn{Hs=\tilde{n}/(\tilde{n}-1)[1-\sum_i\bar{p_i^2}-Ho/2\tilde{n}],} +#' +#' where \eqn{\tilde{n}=np/\sum_k 1/n_k} and +#' \eqn{\bar{p_i^2}=\sum_k p_{ki}^2/np} +#' following Nei(1987) in pp.164--5 +#' +#' @references Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press +#' +#' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()], otherwise the full tibble +#' will be considered as belonging to a single population). +#' @param by_locus boolean, determining whether Hs should be returned by locus(TRUE), +#' or as a single genome wide value (FALSE, the default). +#' @param include_global boolean determining whether, besides the population specific estiamtes, a global +#' estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value), +#' or a matrix with n+1 columns if `by_locus=TRUE`. +#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()] +#' @returns a vector of mean population observed heterozygosities (if `by_locus=FALSE`), or a matrix of +#' estimates by locus (rows are loci, columns are populations, `by_locus=TRUE`) +#' @export + + +# adapted from hierfstat +pop_het_exp <- function(.x, by_locus = FALSE, include_global = FALSE, n_cores = bigstatsr::nb_cores()){ + stopifnot_diploid(.x) + # get the populations if it is a grouped gen_tibble + if (inherits(.x,"grouped_df")){ + .group_levels <- .x %>% group_keys() + .group_ids <- dplyr::group_indices(.x)-1 + } else { # create a dummy pop + .group_levels = tibble(population="pop") + .group_ids <- rep(0,nrow(.x)) + } + + # summarise population frequencies + pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, + rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = .group_ids, + ngroups = nrow(.group_levels), + ncores = n_cores) + sHo <-pop_freqs_df$het_obs + mHo <- apply(sHo, 1, mean, na.rm = TRUE) + n <-pop_freqs_df$n/2 + # sum of squared frequencies + sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2 + Hs <- (1 - sp2 - sHo/2/n) + Hs <- n/(n - 1) * Hs + + colnames(Hs)<- .group_levels %>% dplyr::pull(1) + if (include_global){ + global <- (.x %>% pop_global_stats(by_locus = TRUE, n_cores = n_cores))$Hs + Hs <- cbind(Hs, global) + } + + if (by_locus){ + return(Hs) + } else { + return(colMeans(Hs, na.rm = TRUE)) + } +} + + +# alias for gene diversity +#' @export +#' @rdname pop_het_exp +pop_gene_div <- function (...){ + pop_het_exp(...) +} diff --git a/R/pop_het_obs.R b/R/pop_het_obs.R new file mode 100644 index 00000000..c0e180c8 --- /dev/null +++ b/R/pop_het_obs.R @@ -0,0 +1,58 @@ +#' Compute the population observed heterozygosity +#' +#' This function computes population heterozygosity, using the formula of Nei (1987). +#' +#' @details Within population observed heterozygosity is defined as: +#' \eqn{Ho= 1-\sum_k \sum_i P_{k,ii}/n_p} +#' where \eqn{P_{k,ii}} represents the proportion of homozygote \eqn{i} in sample \eqn{k} and +#' \eqn{n_{p}} the number of samples. +#' following Nei(1987) in pp.164--5 +#' +#' @references Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press +#' +#' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()], otherwise the full tibble +#' will be considered as belonging to a single population). +#' @param by_locus boolean, determining whether Ho should be returned by locus(TRUE), +#' or as a single genome wide value (FALSE, the default). +#' @param include_global boolean determining whether, besides the population specific estiamtes, a global +#' estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value), +#' or a matrix with n+1 columns if `by_locus=TRUE`. +#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()] +#' @returns a vector of mean population observed heterozygosities (if `by_locus=FALSE`), or a matrix of +#' estimates by locus (rows are loci, clumns are populations, `by_locus=TRUE`) +#' @export + + +# adapted from hierfstat +pop_het_obs <- function(.x, by_locus = FALSE, include_global = FALSE, n_cores = bigstatsr::nb_cores()){ + stopifnot_diploid(.x) + # get the populations if it is a grouped gen_tibble + if (inherits(.x,"grouped_df")){ + .group_levels <- .x %>% group_keys() + .group_ids <- dplyr::group_indices(.x)-1 + } else { # create a dummy pop + .group_levels = tibble(population="pop") + .group_ids <- rep(0,nrow(.x)) + } + + # summarise population frequencies + pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, + rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = .group_ids, + ngroups = nrow(.group_levels), + ncores = n_cores) + + Ho <- pop_freqs_df$het_obs + colnames(Ho)<- .group_levels %>% dplyr::pull(1) + if (include_global){ + global <- (.x %>% pop_global_stats(by_locus = TRUE, n_cores = n_cores))$Ho + Ho <- cbind(Ho, global) + } + + if (by_locus){ + return(Ho) + } else { + return(colMeans(Ho, na.rm = TRUE)) + } +} diff --git a/man/hierfstat_basic_stats.Rd b/man/hierfstat_basic_stats.Rd new file mode 100644 index 00000000..888262d8 --- /dev/null +++ b/man/hierfstat_basic_stats.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hierfstat_basic_stats.R +\name{hierfstat_basic_stats} +\alias{hierfstat_basic_stats} +\title{Compute basic population statistics} +\usage{ +hierfstat_basic_stats(.x, n_cores = bigstatsr::nb_cores()) +} +\arguments{ +\item{.x}{a \code{\link{gen_tibble}} (usually grouped, as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}}; use on +a single population will return a number of quantities as NA/NaN)} + +\item{n_cores}{number of cores to be used, it defaults to \code{\link[bigstatsr:reexports]{bigstatsr::nb_cores()}}} +} +\value{ +a tibble of population statistics, with populations as rows and statistics as columns +} +\description{ +This function computes basic population statistics, following the formulas used in \code{hietfstat}. +} diff --git a/man/indiv_het_obs.Rd b/man/indiv_het_obs.Rd index ff5da3dd..4c6c6cfb 100644 --- a/man/indiv_het_obs.Rd +++ b/man/indiv_het_obs.Rd @@ -4,7 +4,6 @@ \alias{indiv_het_obs} \alias{indiv_het_obs.tbl_df} \alias{indiv_het_obs.vctrs_bigSNP} -\alias{indiv_het_obs.grouped_df} \title{Estimate individual observed heterozygosity} \usage{ indiv_het_obs(.x, ...) @@ -12,8 +11,6 @@ indiv_het_obs(.x, ...) \method{indiv_het_obs}{tbl_df}(.x, ...) \method{indiv_het_obs}{vctrs_bigSNP}(.x, ...) - -\method{indiv_het_obs}{grouped_df}(.x, ...) } \arguments{ \item{.x}{a vector of class \code{vctrs_bigSNP} (usually the \code{genotype} column of diff --git a/man/indiv_missingness.Rd b/man/indiv_missingness.Rd index 140f610f..ebec8198 100644 --- a/man/indiv_missingness.Rd +++ b/man/indiv_missingness.Rd @@ -4,7 +4,6 @@ \alias{indiv_missingness} \alias{indiv_missingness.tbl_df} \alias{indiv_missingness.vctrs_bigSNP} -\alias{indiv_missingness.grouped_df} \title{Estimate individual missingness} \usage{ indiv_missingness(.x, as_counts = FALSE, ...) @@ -12,8 +11,6 @@ indiv_missingness(.x, as_counts = FALSE, ...) \method{indiv_missingness}{tbl_df}(.x, as_counts = FALSE, ...) \method{indiv_missingness}{vctrs_bigSNP}(.x, as_counts = FALSE, ...) - -\method{indiv_missingness}{grouped_df}(.x, as_counts = FALSE, ...) } \arguments{ \item{.x}{a vector of class \code{vctrs_bigSNP} (usually the \code{genotype} column of diff --git a/man/indiv_ploidy.Rd b/man/indiv_ploidy.Rd index f67ba4e3..7210b697 100644 --- a/man/indiv_ploidy.Rd +++ b/man/indiv_ploidy.Rd @@ -4,7 +4,6 @@ \alias{indiv_ploidy} \alias{indiv_ploidy.tbl_df} \alias{indiv_ploidy.vctrs_bigSNP} -\alias{indiv_ploidy.grouped_df} \title{Return individual ploidy} \usage{ indiv_ploidy(.x, ...) @@ -12,8 +11,6 @@ indiv_ploidy(.x, ...) \method{indiv_ploidy}{tbl_df}(.x, ...) \method{indiv_ploidy}{vctrs_bigSNP}(.x, ...) - -\method{indiv_ploidy}{grouped_df}(.x, ...) } \arguments{ \item{.x}{a \code{\link{gen_tibble}}, or a vector of class \code{vctrs_bigSNP} (usually the \code{genotype} column of diff --git a/man/pop_fis.Rd b/man/pop_fis.Rd index 0c41ba4c..923bc984 100644 --- a/man/pop_fis.Rd +++ b/man/pop_fis.Rd @@ -4,15 +4,28 @@ \alias{pop_fis} \title{Compute population specific FIS} \usage{ -pop_fis(.x, include_global = FALSE, allele_sharing_mat = NULL) +pop_fis( + .x, + method = c("Nei87", "WG17"), + by_locus = FALSE, + include_global = FALSE, + allele_sharing_mat = NULL +) } \arguments{ \item{.x}{a grouped \code{\link{gen_tibble}} (as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}})} -\item{include_global}{boolean determining whether, besides the population specific fis, a global -fis should be appended. Note that this will return a vector of n populations plus 1 (the global value)} +\item{method}{one of "Nei87" (based on Nei 1987) or "WG17" (for Weir and Goudet 2017) to compute FIS} -\item{allele_sharing_mat}{optional, the matrix of Allele Sharing returned by +\item{by_locus}{boolean, determining whether FIS should be returned by locus(TRUE), +or as a single genome wide value (FALSE, the default). Note that this is only relevant for "Nei87", +as "WG17" always returns a single value.} + +\item{include_global}{boolean determining whether, besides the population specific estiamtes, a global +estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value), +or a matrix with n+1 columns if \code{by_locus=TRUE}.} + +\item{allele_sharing_mat}{optional and only relevant for "WG17", the matrix of Allele Sharing returned by \code{\link[=pairwise_allele_sharing]{pairwise_allele_sharing()}} with \code{as_matrix=TRUE}. As a number of statistics can be derived from the Allele Sharing matrix, it it sometimes more efficient to pre-compute this matrix.} @@ -21,5 +34,10 @@ it it sometimes more efficient to pre-compute this matrix.} a vector of population specific fis (plus the global value if \code{include_global=TRUE}) } \description{ -This function computes population specific FIS (as computed by \code{\link[hierfstat:fs.dosage]{hierfstat::fis.dosage()}}). +This function computes population specific FIS, using either the approach of Nei 1987 ((as computed by \code{\link[hierfstat:basic.stats]{hierfstat::basic.stats()}}).) or of Weir and Goudet 2017 +(as computed by \code{\link[hierfstat:fs.dosage]{hierfstat::fis.dosage()}}). +} +\references{ +Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press +Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085 } diff --git a/man/pop_fst.Rd b/man/pop_fst.Rd index f606c26c..a432aff1 100644 --- a/man/pop_fst.Rd +++ b/man/pop_fst.Rd @@ -20,5 +20,9 @@ derived from the Allele Sharing matrix,} a vector of population specific Fst (plus the global value if \code{include_global=TRUE}) } \description{ -This function computes population specific Fst (as computed by \code{\link[hierfstat:fs.dosage]{hierfstat::fst.dosage()}}). +This function computes population specific Fst, using the approach in Weir and Goudet 2017 +(as computed by \code{\link[hierfstat:fs.dosage]{hierfstat::fst.dosage()}}). +} +\references{ +Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085 } diff --git a/man/pop_global_stats.Rd b/man/pop_global_stats.Rd new file mode 100644 index 00000000..ff0e002b --- /dev/null +++ b/man/pop_global_stats.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pop_global_stats.R +\name{pop_global_stats} +\alias{pop_global_stats} +\title{Compute basic population global statistics} +\usage{ +pop_global_stats(.x, by_locus = FALSE, n_cores = bigstatsr::nb_cores()) +} +\arguments{ +\item{.x}{a \code{\link{gen_tibble}} (usually grouped, as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}}; use on +a single population will return a number of quantities as NA/NaN)} + +\item{by_locus}{boolean, determining whether the statistics should be returned by locus(TRUE), +or as a single genome wide value (FALSE, the default).} + +\item{n_cores}{number of cores to be used, it defaults to \code{\link[bigstatsr:reexports]{bigstatsr::nb_cores()}}} +} +\value{ +a tibble of population statistics, with populations as rows and statistics as columns +} +\description{ +This function computes basic population global statistics, following the formulas used in \code{hietfstat::basic.stats}. +} +\details{ +TO BE CORRECTED (THIS IS INCOMPLETE, AND FORMULAE WERE FILLED IN BY COPILOT) +The statistics computed are: +\itemize{ +\item observed heterozygosity (\code{Ho}) +\item expected heterozygosity (\code{Hs}) +\item total heterozygosity (\code{Ht}) +\item within population diversity (\code{Dst}) +\item total population diversity (\code{Htp}) +\item total population diversity (\code{Dstp}) +\item Fst +\item Fst prime +\item Fis +\item Dest +} + +The formulas used are: +\eqn{Ho= 1-\sum_k \sum_i P_{k,ii}/n_p} +where \eqn{P_{k,ii}} represents the proportion of homozygote \eqn{i} in sample \eqn{k} and +\eqn{n_{p}} the number of samples. +\eqn{Hs=\tilde{n}/(\tilde{n}-1)[1-\sum_i\bar{p_i^2}-Ho/2\tilde{n}]} +where \eqn{\tilde{n}=np/\sum_k 1/n_k} and +\eqn{\bar{p_i^2}=\sum_k p_{ki}^2/np} +\eqn{Ht=1-\sum_k \bar{p_k^2} + \bar{Hs}/\tilde{n}/np - Ho/2/\tilde{n}/np} +\eqn{Dst=Ht - Hs} +} diff --git a/man/pop_het_exp.Rd b/man/pop_het_exp.Rd new file mode 100644 index 00000000..a32661ca --- /dev/null +++ b/man/pop_het_exp.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pop_het_exp.R +\name{pop_het_exp} +\alias{pop_het_exp} +\alias{pop_gene_div} +\title{Compute the population expected heterozygosity} +\usage{ +pop_het_exp( + .x, + by_locus = FALSE, + include_global = FALSE, + n_cores = bigstatsr::nb_cores() +) + +pop_gene_div(...) +} +\arguments{ +\item{.x}{a \code{\link{gen_tibble}} (usually grouped, as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}}, otherwise the full tibble +will be considered as belonging to a single population).} + +\item{by_locus}{boolean, determining whether Hs should be returned by locus(TRUE), +or as a single genome wide value (FALSE, the default).} + +\item{include_global}{boolean determining whether, besides the population specific estiamtes, a global +estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value), +or a matrix with n+1 columns if \code{by_locus=TRUE}.} + +\item{n_cores}{number of cores to be used, it defaults to \code{\link[bigstatsr:reexports]{bigstatsr::nb_cores()}}} +} +\value{ +a vector of mean population observed heterozygosities (if \code{by_locus=FALSE}), or a matrix of +estimates by locus (rows are loci, columns are populations, \code{by_locus=TRUE}) +} +\description{ +This function computes expected population heterozygosity (also +referred to as gene diversity, to avoid the potentially misleading use of the term "expected" in this context), using the formula of Nei (1987). +} +\details{ +Within population expected heterozygosity (gene diversity) is defined as: +\eqn{Hs=\tilde{n}/(\tilde{n}-1)[1-\sum_i\bar{p_i^2}-Ho/2\tilde{n}],} + +where \eqn{\tilde{n}=np/\sum_k 1/n_k} and +\eqn{\bar{p_i^2}=\sum_k p_{ki}^2/np} +following Nei(1987) in pp.164--5 +} +\references{ +Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press +} diff --git a/man/pop_het_obs.Rd b/man/pop_het_obs.Rd new file mode 100644 index 00000000..18a4a9ca --- /dev/null +++ b/man/pop_het_obs.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pop_het_obs.R +\name{pop_het_obs} +\alias{pop_het_obs} +\title{Compute the population observed heterozygosity} +\usage{ +pop_het_obs( + .x, + by_locus = FALSE, + include_global = FALSE, + n_cores = bigstatsr::nb_cores() +) +} +\arguments{ +\item{.x}{a \code{\link{gen_tibble}} (usually grouped, as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}}, otherwise the full tibble +will be considered as belonging to a single population).} + +\item{by_locus}{boolean, determining whether Ho should be returned by locus(TRUE), +or as a single genome wide value (FALSE, the default).} + +\item{include_global}{boolean determining whether, besides the population specific estiamtes, a global +estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value), +or a matrix with n+1 columns if \code{by_locus=TRUE}.} + +\item{n_cores}{number of cores to be used, it defaults to \code{\link[bigstatsr:reexports]{bigstatsr::nb_cores()}}} +} +\value{ +a vector of mean population observed heterozygosities (if \code{by_locus=FALSE}), or a matrix of +estimates by locus (rows are loci, clumns are populations, \code{by_locus=TRUE}) +} +\description{ +This function computes population heterozygosity, using the formula of Nei (1987). +} +\details{ +Within population observed heterozygosity is defined as: +\eqn{Ho= 1-\sum_k \sum_i P_{k,ii}/n_p} +where \eqn{P_{k,ii}} represents the proportion of homozygote \eqn{i} in sample \eqn{k} and +\eqn{n_{p}} the number of samples. +following Nei(1987) in pp.164--5 +} +\references{ +Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press +} diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R index 58fb40bc..f5b6e89f 100644 --- a/tests/testthat/test_gen_tibble.R +++ b/tests/testthat/test_gen_tibble.R @@ -533,11 +533,11 @@ test_that("gt without population is valid",{ gt_load(file_names[1]) # will fail in functions that require grouping - expect_error(pop_fis(test_gt), ".x should be a grouped df") + expect_error(pop_fst(test_gt), ".x should be a grouped df") # functions that require grouping work after adding groups test_gt$groups <- c("A","A","B") test_gt <- test_gt %>% group_by(groups) - expect_equal(unname(pop_fis(test_gt)), c(-0.25, NaN)) + expect_equal(unname(pop_fst(test_gt)), c(-0, NaN)) # gen_tbl from vcf does not have population automatically vcf_path <- system.file("extdata/pop_b.vcf", package = "tidypopgen") diff --git a/tests/testthat/test_pop_basic_stats.R b/tests/testthat/test_pop_basic_stats.R index 7e1f8bbb..00bb169f 100644 --- a/tests/testthat/test_pop_basic_stats.R +++ b/tests/testthat/test_pop_basic_stats.R @@ -16,18 +16,59 @@ test_loci <- data.frame(name=paste0("rs",1:6), test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) - -test_that("pairwise_pop_fst compute correctly",{ +test_that("pop_* basic stats functions work correctly",{ test_gt <- test_gt %>% dplyr::group_by(population) test_hier <- gt_as_hierfstat(test_gt) - # compare results against hierfstat for Nei87 (Nei86 does not correct for Ho - # when computing Ht, so it gives a different result) - basic_gt <- test_gt %>% hierfstat_basic_stats() - basic_hier <- hierfstat::basic.stats(test_hier) - # hiefstat values are rounded to 4 dp - expect_true(all.equal(basic_hier$perloc, basic_gt$perloc, check.attributes=FALSE)) - #pair_fst_locus <- test_gt %>% pairwise_pop_fst(by_locus = TRUE) + # observed heterozygosity by locus + test_het_obs <- test_gt %>% pop_het_obs(by_locus = TRUE) + expect_true(all.equal(basic_hier$Ho, round(test_het_obs, 4), check.attributes=FALSE)) + # overall (mean of by locus values) + test_het_obs <- test_gt %>% pop_het_obs(by_locus = FALSE) + expect_true(all.equal(colMeans(basic_hier$Ho, na.rm=TRUE), test_het_obs, check.attributes=FALSE)) + # check the overall value + test_het_obs <- test_gt %>% pop_het_obs(by_locus = TRUE, include_global = TRUE) + expect_true(all.equal(round(test_het_obs[,4], 4), basic_hier$perloc$Ho, check.attributes=FALSE)) + test_het_obs <- test_gt %>% pop_het_obs(by_locus = FALSE, include_global = TRUE) + expect_true(all.equal(round(test_het_obs[4], 4), basic_hier$overall["Ho"], check.attributes=FALSE)) + + # test expected heterozygosity by locus + test_het_exp <- test_gt %>% pop_het_exp(by_locus = TRUE) + expect_true(all.equal(basic_hier$Hs, round(test_het_exp, 4), check.attributes=FALSE)) + # overall (mean of by locus values) + test_het_exp <- test_gt %>% pop_het_exp(by_locus = FALSE) + expect_true(all.equal(colMeans(basic_hier$Hs, na.rm=TRUE), test_het_exp, check.attributes=FALSE)) + # check the overall value + test_het_exp <- test_gt %>% pop_het_exp(by_locus = TRUE, include_global = TRUE) + expect_true(all.equal(round(test_het_exp[,4], 4), basic_hier$perloc$Hs, check.attributes=FALSE)) + test_het_exp <- test_gt %>% pop_het_exp(by_locus = FALSE, include_global = TRUE) + expect_true(all.equal(round(test_het_exp[4], 4), basic_hier$overall["Hs"], check.attributes=FALSE)) + + # test Fis by locus + test_fis <- test_gt %>% pop_fis(by_locus = TRUE, method= "Nei87") + expect_true(all.equal(basic_hier$Fis, round(test_fis, 4), check.attributes=FALSE)) + # overall (mean of by locus values) + test_fis <- test_gt %>% pop_fis(by_locus = FALSE) + expect_true(all.equal(round(colMeans(basic_hier$Fis, na.rm=TRUE),4), round(test_fis,4), check.attributes=FALSE)) + # check the overall value + test_fis <- test_gt %>% pop_fis(by_locus = TRUE, include_global = TRUE) + expect_true(all.equal(round(test_fis[,4], 4), basic_hier$perloc$Fis, check.attributes=FALSE)) + test_fis <- test_gt %>% pop_fis(by_locus = FALSE, include_global = TRUE) + expect_true(all.equal(round(test_fis[4], 4), basic_hier$overall["Fis"], check.attributes=FALSE)) + + # test global stats by locus + test_global_stats <- test_gt %>% pop_global_stats(by_locus = TRUE) + expect_true(all.equal(basic_hier$perloc, round(test_global_stats, 4), check.attributes=FALSE)) + # test global stats overall + test_global_stats <- test_gt %>% pop_global_stats(by_locus = FALSE) + expect_true(all.equal(basic_hier$overall, round(test_global_stats, 4), check.attributes=FALSE)) + + # test Fis errors for incorrect parameters for Nei87 + expect_error(test_gt %>% pop_fis(by_locus = TRUE, method= "Nei87", allele_sharing_mat = matrix(1, nrow=7, ncol=7)), + "allele_sharing_mat not relevant for Nei87") + # test Fis errors for incorrect parameters for WG17 + expect_error(test_gt %>% pop_fis(by_locus = TRUE, method= "WG17", allele_sharing_mat = matrix(1, nrow=7, ncol=7)), + "by_locus not implemented for WG17") }) diff --git a/tests/testthat/test_pop_fis_fst.R b/tests/testthat/test_pop_fis_fst.R index 017dfdfd..142b088d 100644 --- a/tests/testthat/test_pop_fis_fst.R +++ b/tests/testthat/test_pop_fis_fst.R @@ -17,15 +17,15 @@ test_loci <- data.frame(name=paste0("rs",1:6), test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) -test_that("pop_fst and pop_fist compute correctly",{ +test_that("pop_fst and pop_fist WG17 compute correctly",{ test_gt <- test_gt %>% dplyr::group_by(population) # compare results against raw hierfstat code - fis_by_pop <- test_gt %>% pop_fis(include_global=TRUE) + fis_by_pop <- test_gt %>% pop_fis(include_global=TRUE, method = "WG17") fis_by_pop_hier <- hierfstat::fis.dosage(test_genotypes,pop=test_indiv_meta$population) expect_true(all.equal(fis_by_pop, fis_by_pop_hier,check.attributes = FALSE)) # now check that we don't get the global - fis_by_pop_sub <- test_gt %>% pop_fis() + fis_by_pop_sub <- test_gt %>% pop_fis(method = "WG17") expect_true (all.equal(fis_by_pop[-length(fis_by_pop)], fis_by_pop_sub)) From c98c3363c7f5b5b65860fd7167aede1386cf9b40 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Thu, 10 Oct 2024 13:37:01 +0100 Subject: [PATCH 05/20] Some edits to formulae to match Nei's notation. Note that the explanation is still out of sync with what we do to obtain per pop estimates. --- R/pop_global_stats.R | 14 +++++++++++++- R/pop_het_exp.R | 11 ++++++----- R/pop_het_obs.R | 13 ++++++++----- man/pop_global_stats.Rd | 15 ++++++++++++++- man/pop_het_exp.Rd | 11 ++++++----- man/pop_het_obs.Rd | 12 +++++++----- 6 files changed, 54 insertions(+), 22 deletions(-) diff --git a/R/pop_global_stats.R b/R/pop_global_stats.R index dcffd63f..90a8dc25 100644 --- a/R/pop_global_stats.R +++ b/R/pop_global_stats.R @@ -1,6 +1,17 @@ #' Compute basic population global statistics #' -#' This function computes basic population global statistics, following the formulas used in `hietfstat::basic.stats`. +#' This function computes basic population global statistics, following the notation in Nei 1987 (which in turn is based on Nei and Chesser 1983): +#' #' - observed heterozygosity (`Ho`) +#' - expected heterozygosity (`Hs`) +#' - total heterozygosity (`Ht`) +#' - within population diversity (`Dst`) +#' - total population diversity (`Htp`) +#' - total population diversity (`Dstp`) +#' - Fst +#' - Fst prime +#' - Fis +#' - Dest +#' following the formulas used in `hietfstat::basic.stats`. #' #' @details TO BE CORRECTED (THIS IS INCOMPLETE, AND FORMULAE WERE FILLED IN BY COPILOT) #' The statistics computed are: @@ -14,6 +25,7 @@ #' - Fst prime #' - Fis #' - Dest +#' #' @details The formulas used are: #' \eqn{Ho= 1-\sum_k \sum_i P_{k,ii}/n_p} #' where \eqn{P_{k,ii}} represents the proportion of homozygote \eqn{i} in sample \eqn{k} and diff --git a/R/pop_het_exp.R b/R/pop_het_exp.R index d434cc23..3c6c734a 100644 --- a/R/pop_het_exp.R +++ b/R/pop_het_exp.R @@ -3,12 +3,13 @@ #' This function computes expected population heterozygosity (also #' referred to as gene diversity, to avoid the potentially misleading use of the term "expected" in this context), using the formula of Nei (1987). #' -#' @details Within population expected heterozygosity (gene diversity) is defined as: -#' \eqn{Hs=\tilde{n}/(\tilde{n}-1)[1-\sum_i\bar{p_i^2}-Ho/2\tilde{n}],} +#' @details Within population expected heterozygosity (gene diversity) \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr +#' \eqn{\hat{h}_s=\tilde{n}/(\tilde{n}-1)[1-\sum_{i}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr #' -#' where \eqn{\tilde{n}=np/\sum_k 1/n_k} and -#' \eqn{\bar{p_i^2}=\sum_k p_{ki}^2/np} -#' following Nei(1987) in pp.164--5 +#' where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr +#' \eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr +#' following equation 7.39 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_s} at +#' the genome level for each population is simply the mean of the locus estimates for each population. #' #' @references Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press #' diff --git a/R/pop_het_obs.R b/R/pop_het_obs.R index c0e180c8..5c8d4082 100644 --- a/R/pop_het_obs.R +++ b/R/pop_het_obs.R @@ -2,11 +2,14 @@ #' #' This function computes population heterozygosity, using the formula of Nei (1987). #' -#' @details Within population observed heterozygosity is defined as: -#' \eqn{Ho= 1-\sum_k \sum_i P_{k,ii}/n_p} -#' where \eqn{P_{k,ii}} represents the proportion of homozygote \eqn{i} in sample \eqn{k} and -#' \eqn{n_{p}} the number of samples. -#' following Nei(1987) in pp.164--5 +#' @details Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr +#' \eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m} \hat{X}_{kii}/s}\cr +#' where\cr +#' \eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr +#' \eqn{s} the number of populations,\cr +#' following equation 7.38 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_o} at +#' the genome level is simply the mean of the locus estimates. The global estimate for each locus is the (unweighted) mean of the population estimates, +#' and its genome-wide value is the mean of the global locus estimates. #' #' @references Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press #' diff --git a/man/pop_global_stats.Rd b/man/pop_global_stats.Rd index ff0e002b..5c0bfc4f 100644 --- a/man/pop_global_stats.Rd +++ b/man/pop_global_stats.Rd @@ -19,7 +19,20 @@ or as a single genome wide value (FALSE, the default).} a tibble of population statistics, with populations as rows and statistics as columns } \description{ -This function computes basic population global statistics, following the formulas used in \code{hietfstat::basic.stats}. +This function computes basic population global statistics, following the notation in Nei 1987 (which in turn is based on Nei and Chesser 1983): +#' - observed heterozygosity (\code{Ho}) +\itemize{ +\item expected heterozygosity (\code{Hs}) +\item total heterozygosity (\code{Ht}) +\item within population diversity (\code{Dst}) +\item total population diversity (\code{Htp}) +\item total population diversity (\code{Dstp}) +\item Fst +\item Fst prime +\item Fis +\item Dest +following the formulas used in \code{hietfstat::basic.stats}. +} } \details{ TO BE CORRECTED (THIS IS INCOMPLETE, AND FORMULAE WERE FILLED IN BY COPILOT) diff --git a/man/pop_het_exp.Rd b/man/pop_het_exp.Rd index a32661ca..f2570ac4 100644 --- a/man/pop_het_exp.Rd +++ b/man/pop_het_exp.Rd @@ -36,12 +36,13 @@ This function computes expected population heterozygosity (also referred to as gene diversity, to avoid the potentially misleading use of the term "expected" in this context), using the formula of Nei (1987). } \details{ -Within population expected heterozygosity (gene diversity) is defined as: -\eqn{Hs=\tilde{n}/(\tilde{n}-1)[1-\sum_i\bar{p_i^2}-Ho/2\tilde{n}],} +Within population expected heterozygosity (gene diversity) \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr +\eqn{\hat{h}_s=\tilde{n}/(\tilde{n}-1)[1-\sum_{i}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr -where \eqn{\tilde{n}=np/\sum_k 1/n_k} and -\eqn{\bar{p_i^2}=\sum_k p_{ki}^2/np} -following Nei(1987) in pp.164--5 +where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr +\eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr +following equation 7.39 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_s} at +the genome level for each population is simply the mean of the locus estimates for each population. } \references{ Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press diff --git a/man/pop_het_obs.Rd b/man/pop_het_obs.Rd index 18a4a9ca..5c1b1be0 100644 --- a/man/pop_het_obs.Rd +++ b/man/pop_het_obs.Rd @@ -32,11 +32,13 @@ estimates by locus (rows are loci, clumns are populations, \code{by_locus=TRUE}) This function computes population heterozygosity, using the formula of Nei (1987). } \details{ -Within population observed heterozygosity is defined as: -\eqn{Ho= 1-\sum_k \sum_i P_{k,ii}/n_p} -where \eqn{P_{k,ii}} represents the proportion of homozygote \eqn{i} in sample \eqn{k} and -\eqn{n_{p}} the number of samples. -following Nei(1987) in pp.164--5 +Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr +\eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m} \hat{X}_{kii}/s}\cr +where\cr +\eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr +\eqn{s} the number of populations,\cr +following equation 7.38 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_o} at +the genome level is simply the mean of the locus estimates. } \references{ Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press From 2fa286ee77a480359256422055e2c7dc5cf4d2c8 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Fri, 11 Oct 2024 08:30:14 +0100 Subject: [PATCH 06/20] more docs editings Still a way to go. --- R/pop_global_stats.R | 39 ++++++++++++++++----------------------- man/pop_global_stats.Rd | 35 +++++++++++++++++------------------ man/pop_het_obs.Rd | 3 ++- 3 files changed, 35 insertions(+), 42 deletions(-) diff --git a/R/pop_global_stats.R b/R/pop_global_stats.R index 90a8dc25..bbff7500 100644 --- a/R/pop_global_stats.R +++ b/R/pop_global_stats.R @@ -1,9 +1,9 @@ #' Compute basic population global statistics #' #' This function computes basic population global statistics, following the notation in Nei 1987 (which in turn is based on Nei and Chesser 1983): -#' #' - observed heterozygosity (`Ho`) -#' - expected heterozygosity (`Hs`) -#' - total heterozygosity (`Ht`) +#' - observed heterozygosity (\eqn{h_o}, column header `Ho`) +#' - expected heterozygosity (\eqn{h_s}, `Hs`) +#' - total heterozygosity (\eqn{h_t}, `Ht`) #' - within population diversity (`Dst`) #' - total population diversity (`Htp`) #' - total population diversity (`Dstp`) @@ -11,28 +11,21 @@ #' - Fst prime #' - Fis #' - Dest -#' following the formulas used in `hietfstat::basic.stats`. +#' @details We use the notation of Nei 1987. That notation was for loci with `m` alleles, but in our case we only have two alleles, so `m=2`. +#' Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr +#' \eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m} \hat{X}_{kii}/s}\cr +#' where\cr +#' \eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr +#' \eqn{s} the number of populations,\cr +#' following equation 7.38 in Nei(1987) on pp.164.\cr #' -#' @details TO BE CORRECTED (THIS IS INCOMPLETE, AND FORMULAE WERE FILLED IN BY COPILOT) -#' The statistics computed are: -#' - observed heterozygosity (`Ho`) -#' - expected heterozygosity (`Hs`) -#' - total heterozygosity (`Ht`) -#' - within population diversity (`Dst`) -#' - total population diversity (`Htp`) -#' - total population diversity (`Dstp`) -#' - Fst -#' - Fst prime -#' - Fis -#' - Dest +#' Within population expected heterozygosity (gene diversity) \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr +#' \eqn{\hat{h}_s=\tilde{n}/(\tilde{n}-1)[1-\sum_{i}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr +#' +#' where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr +#' \eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr +#' following equation 7.39 in Nei(1987) on pp.164. #' -#' @details The formulas used are: -#' \eqn{Ho= 1-\sum_k \sum_i P_{k,ii}/n_p} -#' where \eqn{P_{k,ii}} represents the proportion of homozygote \eqn{i} in sample \eqn{k} and -#' \eqn{n_{p}} the number of samples. -#' \eqn{Hs=\tilde{n}/(\tilde{n}-1)[1-\sum_i\bar{p_i^2}-Ho/2\tilde{n}]} -#' where \eqn{\tilde{n}=np/\sum_k 1/n_k} and -#' \eqn{\bar{p_i^2}=\sum_k p_{ki}^2/np} #' \eqn{Ht=1-\sum_k \bar{p_k^2} + \bar{Hs}/\tilde{n}/np - Ho/2/\tilde{n}/np} #' \eqn{Dst=Ht - Hs} #' diff --git a/man/pop_global_stats.Rd b/man/pop_global_stats.Rd index 5c0bfc4f..94a15fff 100644 --- a/man/pop_global_stats.Rd +++ b/man/pop_global_stats.Rd @@ -20,10 +20,10 @@ a tibble of population statistics, with populations as rows and statistics as co } \description{ This function computes basic population global statistics, following the notation in Nei 1987 (which in turn is based on Nei and Chesser 1983): -#' - observed heterozygosity (\code{Ho}) \itemize{ -\item expected heterozygosity (\code{Hs}) -\item total heterozygosity (\code{Ht}) +\item observed heterozygosity (\eqn{h_o}, column header \code{Ho}) +\item expected heterozygosity (\eqn{h_s}, \code{Hs}) +\item total heterozygosity (\eqn{h_t}, \code{Ht}) \item within population diversity (\code{Dst}) \item total population diversity (\code{Htp}) \item total population diversity (\code{Dstp}) @@ -31,24 +31,23 @@ This function computes basic population global statistics, following the notatio \item Fst prime \item Fis \item Dest -following the formulas used in \code{hietfstat::basic.stats}. } } \details{ -TO BE CORRECTED (THIS IS INCOMPLETE, AND FORMULAE WERE FILLED IN BY COPILOT) -The statistics computed are: -\itemize{ -\item observed heterozygosity (\code{Ho}) -\item expected heterozygosity (\code{Hs}) -\item total heterozygosity (\code{Ht}) -\item within population diversity (\code{Dst}) -\item total population diversity (\code{Htp}) -\item total population diversity (\code{Dstp}) -\item Fst -\item Fst prime -\item Fis -\item Dest -} +We use the notation of Nei 1987. That notation was for loci with \code{m} alleles, but in our case we only have two alleles, so \code{m=2}. +Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr +\eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m} \hat{X}_{kii}/s}\cr +where\cr +\eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr +\eqn{s} the number of populations,\cr +following equation 7.38 in Nei(1987) on pp.164.\cr + +Within population expected heterozygosity (gene diversity) \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr +\eqn{\hat{h}_s=\tilde{n}/(\tilde{n}-1)[1-\sum_{i}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr + +where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr +\eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr +following equation 7.39 in Nei(1987) on pp.164. The formulas used are: \eqn{Ho= 1-\sum_k \sum_i P_{k,ii}/n_p} diff --git a/man/pop_het_obs.Rd b/man/pop_het_obs.Rd index 5c1b1be0..fa2b5b2b 100644 --- a/man/pop_het_obs.Rd +++ b/man/pop_het_obs.Rd @@ -38,7 +38,8 @@ where\cr \eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr \eqn{s} the number of populations,\cr following equation 7.38 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_o} at -the genome level is simply the mean of the locus estimates. +the genome level is simply the mean of the locus estimates. The global estimate for each locus is the (unweighted) mean of the population estimates, +and its genome-wide value is the mean of the global locus estimates. } \references{ Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press From 9a060bdae3a935ffc808cc1c9ea6d40347f7ceed Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Fri, 11 Oct 2024 08:57:53 +0100 Subject: [PATCH 07/20] A few more edits --- R/pop_het_exp.R | 2 +- R/pop_het_obs.R | 6 +++--- man/pop_global_stats.Rd | 7 ------- man/pop_het_exp.Rd | 2 +- man/pop_het_obs.Rd | 6 +++--- 5 files changed, 8 insertions(+), 15 deletions(-) diff --git a/R/pop_het_exp.R b/R/pop_het_exp.R index 3c6c734a..18853ab6 100644 --- a/R/pop_het_exp.R +++ b/R/pop_het_exp.R @@ -8,7 +8,7 @@ #' #' where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr #' \eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr -#' following equation 7.39 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_s} at +#' following equation 7.39 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_s} at #' the genome level for each population is simply the mean of the locus estimates for each population. #' #' @references Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press diff --git a/R/pop_het_obs.R b/R/pop_het_obs.R index 5c8d4082..ff3d2fef 100644 --- a/R/pop_het_obs.R +++ b/R/pop_het_obs.R @@ -7,9 +7,9 @@ #' where\cr #' \eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr #' \eqn{s} the number of populations,\cr -#' following equation 7.38 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_o} at -#' the genome level is simply the mean of the locus estimates. The global estimate for each locus is the (unweighted) mean of the population estimates, -#' and its genome-wide value is the mean of the global locus estimates. +#' following equation 7.38 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. For +#' popoulation specific estimates, the sum is done over a single value of \eqn{k}. \eqn{\hat{h}_o} at +#' the genome level is simply the mean of the locus estimates. #' #' @references Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press #' diff --git a/man/pop_global_stats.Rd b/man/pop_global_stats.Rd index 94a15fff..cfc7f424 100644 --- a/man/pop_global_stats.Rd +++ b/man/pop_global_stats.Rd @@ -49,13 +49,6 @@ where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) an \eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr following equation 7.39 in Nei(1987) on pp.164. -The formulas used are: -\eqn{Ho= 1-\sum_k \sum_i P_{k,ii}/n_p} -where \eqn{P_{k,ii}} represents the proportion of homozygote \eqn{i} in sample \eqn{k} and -\eqn{n_{p}} the number of samples. -\eqn{Hs=\tilde{n}/(\tilde{n}-1)[1-\sum_i\bar{p_i^2}-Ho/2\tilde{n}]} -where \eqn{\tilde{n}=np/\sum_k 1/n_k} and -\eqn{\bar{p_i^2}=\sum_k p_{ki}^2/np} \eqn{Ht=1-\sum_k \bar{p_k^2} + \bar{Hs}/\tilde{n}/np - Ho/2/\tilde{n}/np} \eqn{Dst=Ht - Hs} } diff --git a/man/pop_het_exp.Rd b/man/pop_het_exp.Rd index f2570ac4..6e5adb5e 100644 --- a/man/pop_het_exp.Rd +++ b/man/pop_het_exp.Rd @@ -41,7 +41,7 @@ Within population expected heterozygosity (gene diversity) \eqn{\hat{h}_s} for where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr \eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr -following equation 7.39 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_s} at +following equation 7.39 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_s} at the genome level for each population is simply the mean of the locus estimates for each population. } \references{ diff --git a/man/pop_het_obs.Rd b/man/pop_het_obs.Rd index fa2b5b2b..41d777a8 100644 --- a/man/pop_het_obs.Rd +++ b/man/pop_het_obs.Rd @@ -37,9 +37,9 @@ Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{ where\cr \eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr \eqn{s} the number of populations,\cr -following equation 7.38 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_o} at -the genome level is simply the mean of the locus estimates. The global estimate for each locus is the (unweighted) mean of the population estimates, -and its genome-wide value is the mean of the global locus estimates. +following equation 7.38 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. For +popoulation specific estimates, the sum is done over a single value of \eqn{k}. \eqn{\hat{h}_o} at +the genome level is simply the mean of the locus estimates. } \references{ Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press From e233d797fc03f29ba4fe56011ab3291c894d7038 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Fri, 11 Oct 2024 11:04:20 +0100 Subject: [PATCH 08/20] typos [skip ci] --- R/pop_het_obs.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pop_het_obs.R b/R/pop_het_obs.R index ff3d2fef..08b64408 100644 --- a/R/pop_het_obs.R +++ b/R/pop_het_obs.R @@ -8,7 +8,7 @@ #' \eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr #' \eqn{s} the number of populations,\cr #' following equation 7.38 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. For -#' popoulation specific estimates, the sum is done over a single value of \eqn{k}. \eqn{\hat{h}_o} at +#' population specific estimates, the sum is done over a single value of \eqn{k}. \eqn{\hat{h}_o} at #' the genome level is simply the mean of the locus estimates. #' #' @references Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press @@ -22,7 +22,7 @@ #' or a matrix with n+1 columns if `by_locus=TRUE`. #' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()] #' @returns a vector of mean population observed heterozygosities (if `by_locus=FALSE`), or a matrix of -#' estimates by locus (rows are loci, clumns are populations, `by_locus=TRUE`) +#' estimates by locus (rows are loci, columns are populations, `by_locus=TRUE`) #' @export From 3282ccbd01216a8bc3733bf4534c399b57d89ff6 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Fri, 11 Oct 2024 17:11:41 +0100 Subject: [PATCH 09/20] more equations --- R/pop_fis.R | 4 ++-- R/pop_global_stats.R | 21 +++++++++++++-------- man/pop_fis.Rd | 4 ++-- man/pop_global_stats.Rd | 25 +++++++++++++++---------- man/pop_het_obs.Rd | 4 ++-- 5 files changed, 34 insertions(+), 24 deletions(-) diff --git a/R/pop_fis.R b/R/pop_fis.R index 109a1b55..deb49cd7 100644 --- a/R/pop_fis.R +++ b/R/pop_fis.R @@ -1,12 +1,12 @@ #' Compute population specific FIS #' -#' This function computes population specific FIS, using either the approach of Nei 1987 ((as computed by [hierfstat::basic.stats()]).) or of Weir and Goudet 2017 +#' This function computes population specific FIS, using either the approach of Nei 1987 (as computed by [hierfstat::basic.stats()]) or of Weir and Goudet 2017 #' (as computed by [hierfstat::fis.dosage()]). #' @references #' Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press #' Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085 #' @param .x a grouped [`gen_tibble`] (as obtained by using [dplyr::group_by()]) -#' @param method one of "Nei87" (based on Nei 1987) or "WG17" (for Weir and Goudet 2017) to compute FIS +#' @param method one of "Nei87" (based on Nei 1987, eqn 7.41) or "WG17" (for Weir and Goudet 2017) to compute FIS #' @param by_locus boolean, determining whether FIS should be returned by locus(TRUE), #' or as a single genome wide value (FALSE, the default). Note that this is only relevant for "Nei87", #' as "WG17" always returns a single value. diff --git a/R/pop_global_stats.R b/R/pop_global_stats.R index bbff7500..73e5ddd7 100644 --- a/R/pop_global_stats.R +++ b/R/pop_global_stats.R @@ -2,7 +2,7 @@ #' #' This function computes basic population global statistics, following the notation in Nei 1987 (which in turn is based on Nei and Chesser 1983): #' - observed heterozygosity (\eqn{h_o}, column header `Ho`) -#' - expected heterozygosity (\eqn{h_s}, `Hs`) +#' - expected heterozygosity, also known as gene diversity (\eqn{h_s}, `Hs`) #' - total heterozygosity (\eqn{h_t}, `Ht`) #' - within population diversity (`Dst`) #' - total population diversity (`Htp`) @@ -11,24 +11,29 @@ #' - Fst prime #' - Fis #' - Dest -#' @details We use the notation of Nei 1987. That notation was for loci with `m` alleles, but in our case we only have two alleles, so `m=2`. -#' Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr +#' @details We use the notation of Nei 1987. That notation was for loci with \eqn{m} alleles, but in our case we only have two alleles, so `m=2`. +#' - Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr #' \eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m} \hat{X}_{kii}/s}\cr #' where\cr #' \eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr #' \eqn{s} the number of populations,\cr #' following equation 7.38 in Nei(1987) on pp.164.\cr #' -#' Within population expected heterozygosity (gene diversity) \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr -#' \eqn{\hat{h}_s=\tilde{n}/(\tilde{n}-1)[1-\sum_{i}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr -#' +#' - Within population expected heterozygosity (gene diversity) \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr +#' \eqn{\hat{h}_s=(\tilde{n}/(\tilde{n}-1))[1-\sum_{i=1}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr #' where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr #' \eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr #' following equation 7.39 in Nei(1987) on pp.164. #' -#' \eqn{Ht=1-\sum_k \bar{p_k^2} + \bar{Hs}/\tilde{n}/np - Ho/2/\tilde{n}/np} -#' \eqn{Dst=Ht - Hs} +#' - Total heterozygosity (total gene diversity) \eqn{\hat{h}_t} for a locus with \eqn{m} alleles is defined as:\cr +#' \eqn{\hat{h}_t = 1-\sum_{i=1}^{m} \bar{\hat{x}_i^2} + \hat{h}_s/(\tilde{n}s) - \hat{h}_o/(2\tilde{n}s)}\cr +#' where \cr +#' \eqn{\hat{x}_i=\sum_k \hat{x}_{ki}/s}\cr +#' following equation 7.40 in Nei(1987) on pp.164.\cr +#' - The amount of gene diversity among samples \eqn{Dst} is defined as:\cr +#' \eqn{Dst=Ht - Hs}\cr #' +#' - Total corrected heterozygosity (total gene diversity) \eqn{\hat{h}_t} #' #' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()]; use on #' a single population will return a number of quantities as NA/NaN) diff --git a/man/pop_fis.Rd b/man/pop_fis.Rd index 923bc984..5a162189 100644 --- a/man/pop_fis.Rd +++ b/man/pop_fis.Rd @@ -15,7 +15,7 @@ pop_fis( \arguments{ \item{.x}{a grouped \code{\link{gen_tibble}} (as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}})} -\item{method}{one of "Nei87" (based on Nei 1987) or "WG17" (for Weir and Goudet 2017) to compute FIS} +\item{method}{one of "Nei87" (based on Nei 1987, eqn 7.41) or "WG17" (for Weir and Goudet 2017) to compute FIS} \item{by_locus}{boolean, determining whether FIS should be returned by locus(TRUE), or as a single genome wide value (FALSE, the default). Note that this is only relevant for "Nei87", @@ -34,7 +34,7 @@ it it sometimes more efficient to pre-compute this matrix.} a vector of population specific fis (plus the global value if \code{include_global=TRUE}) } \description{ -This function computes population specific FIS, using either the approach of Nei 1987 ((as computed by \code{\link[hierfstat:basic.stats]{hierfstat::basic.stats()}}).) or of Weir and Goudet 2017 +This function computes population specific FIS, using either the approach of Nei 1987 (as computed by \code{\link[hierfstat:basic.stats]{hierfstat::basic.stats()}}) or of Weir and Goudet 2017 (as computed by \code{\link[hierfstat:fs.dosage]{hierfstat::fis.dosage()}}). } \references{ diff --git a/man/pop_global_stats.Rd b/man/pop_global_stats.Rd index cfc7f424..14562196 100644 --- a/man/pop_global_stats.Rd +++ b/man/pop_global_stats.Rd @@ -22,7 +22,7 @@ a tibble of population statistics, with populations as rows and statistics as co This function computes basic population global statistics, following the notation in Nei 1987 (which in turn is based on Nei and Chesser 1983): \itemize{ \item observed heterozygosity (\eqn{h_o}, column header \code{Ho}) -\item expected heterozygosity (\eqn{h_s}, \code{Hs}) +\item expected heterozygosity, also known as gene diversity (\eqn{h_s}, \code{Hs}) \item total heterozygosity (\eqn{h_t}, \code{Ht}) \item within population diversity (\code{Dst}) \item total population diversity (\code{Htp}) @@ -34,21 +34,26 @@ This function computes basic population global statistics, following the notatio } } \details{ -We use the notation of Nei 1987. That notation was for loci with \code{m} alleles, but in our case we only have two alleles, so \code{m=2}. -Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr +We use the notation of Nei 1987. That notation was for loci with \eqn{m} alleles, but in our case we only have two alleles, so \code{m=2}. +\itemize{ +\item Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr \eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m} \hat{X}_{kii}/s}\cr where\cr \eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr \eqn{s} the number of populations,\cr following equation 7.38 in Nei(1987) on pp.164.\cr - -Within population expected heterozygosity (gene diversity) \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr -\eqn{\hat{h}_s=\tilde{n}/(\tilde{n}-1)[1-\sum_{i}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr - +\item Within population expected heterozygosity (gene diversity) \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr +\eqn{\hat{h}_s=(\tilde{n}/(\tilde{n}-1))[1-\sum_{i=1}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr \eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr following equation 7.39 in Nei(1987) on pp.164. - -\eqn{Ht=1-\sum_k \bar{p_k^2} + \bar{Hs}/\tilde{n}/np - Ho/2/\tilde{n}/np} -\eqn{Dst=Ht - Hs} +\item Total heterozygosity (total gene diversity) \eqn{\hat{h}_t} for a locus with \eqn{m} alleles is defined as:\cr +\eqn{\hat{h}_t = 1-\sum_{i=1}^{m} \bar{\hat{x}_i^2} + \hat{h}_s/(\tilde{n}s) - \hat{h}_o/(2\tilde{n}s)}\cr +where \cr +\eqn{\hat{x}_i=\sum_k \hat{x}_{ki}/s}\cr +following equation 7.40 in Nei(1987) on pp.164.\cr +\item The amount of gene diversity among samples \eqn{Dst} is defined as:\cr +\eqn{Dst=Ht - Hs}\cr +\item Total corrected heterozygosity (total gene diversity) \eqn{\hat{h}_t} +} } diff --git a/man/pop_het_obs.Rd b/man/pop_het_obs.Rd index 41d777a8..c5848578 100644 --- a/man/pop_het_obs.Rd +++ b/man/pop_het_obs.Rd @@ -26,7 +26,7 @@ or a matrix with n+1 columns if \code{by_locus=TRUE}.} } \value{ a vector of mean population observed heterozygosities (if \code{by_locus=FALSE}), or a matrix of -estimates by locus (rows are loci, clumns are populations, \code{by_locus=TRUE}) +estimates by locus (rows are loci, columns are populations, \code{by_locus=TRUE}) } \description{ This function computes population heterozygosity, using the formula of Nei (1987). @@ -38,7 +38,7 @@ where\cr \eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr \eqn{s} the number of populations,\cr following equation 7.38 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. For -popoulation specific estimates, the sum is done over a single value of \eqn{k}. \eqn{\hat{h}_o} at +population specific estimates, the sum is done over a single value of \eqn{k}. \eqn{\hat{h}_o} at the genome level is simply the mean of the locus estimates. } \references{ From a624f5d638e376b8ab672e8886a1061eb3a17a89 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Fri, 18 Oct 2024 20:14:16 +0100 Subject: [PATCH 10/20] Complete documentation of various stats --- R/pop_global_stats.R | 58 ++++++++++++++++++++++++++++++++--------- R/pop_het_exp.R | 7 +++-- man/pop_global_stats.Rd | 53 ++++++++++++++++++++++++++++--------- man/pop_het_exp.Rd | 7 ++++- 4 files changed, 96 insertions(+), 29 deletions(-) diff --git a/R/pop_global_stats.R b/R/pop_global_stats.R index 73e5ddd7..9941c0a5 100644 --- a/R/pop_global_stats.R +++ b/R/pop_global_stats.R @@ -1,16 +1,17 @@ #' Compute basic population global statistics #' #' This function computes basic population global statistics, following the notation in Nei 1987 (which in turn is based on Nei and Chesser 1983): -#' - observed heterozygosity (\eqn{h_o}, column header `Ho`) -#' - expected heterozygosity, also known as gene diversity (\eqn{h_s}, `Hs`) -#' - total heterozygosity (\eqn{h_t}, `Ht`) -#' - within population diversity (`Dst`) -#' - total population diversity (`Htp`) -#' - total population diversity (`Dstp`) -#' - Fst -#' - Fst prime -#' - Fis -#' - Dest +#' - observed heterozygosity ( \eqn{\hat{h}_o}, column header `Ho`) +#' - expected heterozygosity, also known as gene diversity ( \eqn{\hat{h}_s}, `Hs`) +#' - total heterozygosity ( \eqn{\hat{h}_t}, `Ht`) +#' - genetic differentiation between subpopulations (\eqn{D_{st}}, `Dst`) +#' - corrected total population diversity (\eqn{h'_t}, `Htp`) +#' - corrected genetic differentiation between subpopulations (\eqn{D'_{st}}, `Dstp`) +#' - \eqn{\hat{F}_{ST}} (column header, `Fst`) +#' - corrected \eqn{\hat{F'}_{ST}} (column header `Fstp`) +#' - \eqn{\hat{F}_{IS}} (column header, `Fis`) +#' - Jost's \eqn{\hat{D}} (column header, `Dest`) +#' #' @details We use the notation of Nei 1987. That notation was for loci with \eqn{m} alleles, but in our case we only have two alleles, so `m=2`. #' - Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr #' \eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m} \hat{X}_{kii}/s}\cr @@ -30,10 +31,41 @@ #' where \cr #' \eqn{\hat{x}_i=\sum_k \hat{x}_{ki}/s}\cr #' following equation 7.40 in Nei(1987) on pp.164.\cr -#' - The amount of gene diversity among samples \eqn{Dst} is defined as:\cr -#' \eqn{Dst=Ht - Hs}\cr #' -#' - Total corrected heterozygosity (total gene diversity) \eqn{\hat{h}_t} +#' - The amount of gene diversity among samples \eqn{D_{ST}} is defined as:\cr +#' \eqn{D_{ST} = \hat{h}_t - \hat{h}_s}\cr +#' following the equation provided in the text at the top of page 165 in Nei(1987). +#' +#' - The corrected amount of gene diversity among samples \eqn{D'_{ST}} is defined as:\cr +#' \eqn{D'_{ST} = (s/(s-1))D'_{ST}}\cr +#' following the equation provided in the text at the top of page 165 in Nei(1987). +#' +#' - Total corrected heterozygosity (total gene diversity) \eqn{\hat{h}_t} is defined as:\cr +#' \eqn{\hat{h'}_t = \hat{h}_s + D'_{ST}}\cr +#' following the equation provided in the text at the top of page 165 in Nei(1987). +#' - \eqn{\hat{F}_{IS}} is defined as:\cr +#' \eqn{\hat{F}_{IS} = 1 - \hat{h}_o/\hat{h}_s}\cr +#' following equation 7.41 in Nei(1987) on pp.164.\cr +#' +#' - \eqn{\hat{F}_{ST}} is defined as:\cr +#' \eqn{\hat{F}_{ST} = 1 - \hat{h}_s/\hat{h}_t = D_{ST}/\hat{h}_t}\cr +#' following equation 7.43 in Nei(1987) on pp.165.\cr +#' +#' - \eqn{\hat{F'}_{ST}} is defined as:\cr +#' \eqn{\hat{F'}_{ST} = D'_{ST}/\hat{h'}_t}\cr +#' following the explanation provided in the text at the top of page 165 in Nei(1987). +#' +#' - Jost's \eqn{\hat{D}} is defined as:\cr +#' \eqn{\hat{D} = (s/(s-1))((\hat{h'}_t-\hat{h}_s)/(1-\hat{h}_s))}\cr +#' as defined by Jost(2008) +#' +#' All these statistics are first computed by locus, and then averaged across loci (including any +#' monorphic locus) to obtain genome-wide values. The function uses the same algorithm as +#' [hierfstat::basic.stats()] but is optimized for speed and memory usage. +#' +#' @references Nei M, Chesser R (1983) Estimation of fixation indexes and gene diversities. Annals of Human Genetics, 47, 253-259. +#' Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press, pp. 164-165. +#' Jost L (2008) GST and its relatives do not measure differentiation. Molecular Ecology, 17, 4015-4026. #' #' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()]; use on #' a single population will return a number of quantities as NA/NaN) diff --git a/R/pop_het_exp.R b/R/pop_het_exp.R index 18853ab6..83aa5a7b 100644 --- a/R/pop_het_exp.R +++ b/R/pop_het_exp.R @@ -70,6 +70,9 @@ pop_het_exp <- function(.x, by_locus = FALSE, include_global = FALSE, n_cores = # alias for gene diversity #' @export #' @rdname pop_het_exp -pop_gene_div <- function (...){ - pop_het_exp(...) +pop_gene_div <- function (.x, by_locus = FALSE, include_global = FALSE, n_cores = bigstatsr::nb_cores()){ + pop_het_exp(.x=.x, + by_locus = by_locus, + include_global = include_global, + n_cores = n_cores) } diff --git a/man/pop_global_stats.Rd b/man/pop_global_stats.Rd index 14562196..5449f96a 100644 --- a/man/pop_global_stats.Rd +++ b/man/pop_global_stats.Rd @@ -21,16 +21,16 @@ a tibble of population statistics, with populations as rows and statistics as co \description{ This function computes basic population global statistics, following the notation in Nei 1987 (which in turn is based on Nei and Chesser 1983): \itemize{ -\item observed heterozygosity (\eqn{h_o}, column header \code{Ho}) -\item expected heterozygosity, also known as gene diversity (\eqn{h_s}, \code{Hs}) -\item total heterozygosity (\eqn{h_t}, \code{Ht}) -\item within population diversity (\code{Dst}) -\item total population diversity (\code{Htp}) -\item total population diversity (\code{Dstp}) -\item Fst -\item Fst prime -\item Fis -\item Dest +\item observed heterozygosity ( \eqn{\hat{h}_o}, column header \code{Ho}) +\item expected heterozygosity, also known as gene diversity ( \eqn{\hat{h}_s}, \code{Hs}) +\item total heterozygosity ( \eqn{\hat{h}_t}, \code{Ht}) +\item genetic differentiation between subpopulations (\eqn{D_{st}}, \code{Dst}) +\item corrected total population diversity (\eqn{h'_t}, \code{Htp}) +\item corrected genetic differentiation between subpopulations (\eqn{D'_{st}}, \code{Dstp}) +\item \eqn{\hat{F}_{ST}} (column header, \code{Fst}) +\item corrected \eqn{\hat{F'}_{ST}} (column header \code{Fstp}) +\item \eqn{\hat{F}_{IS}} (column header, \code{Fis}) +\item Jost's \eqn{\hat{D}} (column header, \code{Dest}) } } \details{ @@ -52,8 +52,35 @@ following equation 7.39 in Nei(1987) on pp.164. where \cr \eqn{\hat{x}_i=\sum_k \hat{x}_{ki}/s}\cr following equation 7.40 in Nei(1987) on pp.164.\cr -\item The amount of gene diversity among samples \eqn{Dst} is defined as:\cr -\eqn{Dst=Ht - Hs}\cr -\item Total corrected heterozygosity (total gene diversity) \eqn{\hat{h}_t} +\item The amount of gene diversity among samples \eqn{D_{ST}} is defined as:\cr +\eqn{D_{ST} = \hat{h}_t - \hat{h}_s}\cr +following the equation provided in the text at the top of page 165 in Nei(1987). +\item The corrected amount of gene diversity among samples \eqn{D'_{ST}} is defined as:\cr +\eqn{D'_{ST} = (s/(s-1))D'_{ST}}\cr +following the equation provided in the text at the top of page 165 in Nei(1987). +\item Total corrected heterozygosity (total gene diversity) \eqn{\hat{h}_t} is defined as:\cr +\eqn{\hat{h'}_t = \hat{h}_s + D'_{ST}}\cr +following the equation provided in the text at the top of page 165 in Nei(1987). +\item \eqn{\hat{F}_{IS}} is defined as:\cr +\eqn{\hat{F}_{IS} = 1 - \hat{h}_o/\hat{h}_s}\cr +following equation 7.41 in Nei(1987) on pp.164.\cr +\item \eqn{\hat{F}_{ST}} is defined as:\cr +\eqn{\hat{F}_{ST} = 1 - \hat{h}_s/\hat{h}_t = D_{ST}/\hat{h}_t}\cr +following equation 7.43 in Nei(1987) on pp.165.\cr +\item \eqn{\hat{F'}_{ST}} is defined as:\cr +\eqn{\hat{F'}_{ST} = D'_{ST}/\hat{h'}_t}\cr +following the explanation provided in the text at the top of page 165 in Nei(1987). +\item Jost's \eqn{\hat{D}} is defined as:\cr +\eqn{\hat{D} = (s/(s-1))((\hat{h'}_t-\hat{h}_s)/(1-\hat{h}_s))}\cr +as defined by Jost(2008) } + +All these statistics are first computed by locus, and then averaged across loci (including any +monorphic locus) to obtain genome-wide values. The function uses the same algorithm as +\code{\link[hierfstat:basic.stats]{hierfstat::basic.stats()}} but is optimized for speed and memory usage. +} +\references{ +Nei M, Chesser R (1983) Estimation of fixation indexes and gene diversities. Annals of Human Genetics, 47, 253-259. +Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press, pp. 164-165. +Jost L (2008) GST and its relatives do not measure differentiation. Molecular Ecology, 17, 4015-4026. } diff --git a/man/pop_het_exp.Rd b/man/pop_het_exp.Rd index 6e5adb5e..3d674f53 100644 --- a/man/pop_het_exp.Rd +++ b/man/pop_het_exp.Rd @@ -12,7 +12,12 @@ pop_het_exp( n_cores = bigstatsr::nb_cores() ) -pop_gene_div(...) +pop_gene_div( + .x, + by_locus = FALSE, + include_global = FALSE, + n_cores = bigstatsr::nb_cores() +) } \arguments{ \item{.x}{a \code{\link{gen_tibble}} (usually grouped, as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}}, otherwise the full tibble From 561f91b1b2f1fec43e3df3aca2fb432a4c8bb493 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Fri, 18 Oct 2024 20:35:12 +0100 Subject: [PATCH 11/20] remove old function --- NAMESPACE | 1 - R/hierfstat_basic_stats.R | 103 ----------------------------------- man/hierfstat_basic_stats.Rd | 20 ------- 3 files changed, 124 deletions(-) delete mode 100644 R/hierfstat_basic_stats.R delete mode 100644 man/hierfstat_basic_stats.Rd diff --git a/NAMESPACE b/NAMESPACE index 2b3465bb..24f96000 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -103,7 +103,6 @@ export(gt_roh_window) export(gt_save) export(gt_set_imputed) export(gt_uses_imputed) -export(hierfstat_basic_stats) export(indiv_het_obs) export(indiv_missingness) export(indiv_ploidy) diff --git a/R/hierfstat_basic_stats.R b/R/hierfstat_basic_stats.R deleted file mode 100644 index 148153e6..00000000 --- a/R/hierfstat_basic_stats.R +++ /dev/null @@ -1,103 +0,0 @@ -#' Compute basic population statistics -#' -#' This function computes basic population statistics, following the formulas used in `hietfstat`. -#' -#' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()]; use on -#' a single population will return a number of quantities as NA/NaN) -#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()] -#' @returns a tibble of population statistics, with populations as rows and statistics as columns -#' @export - - -# adapted from hierfstat -hierfstat_basic_stats <- function(.x, n_cores = bigstatsr::nb_cores()){ - stopifnot_diploid(.x) - # get the populations if it is a grouped gen_tibble - if (inherits(.x,"grouped_df")){ - .group_levels <- .x %>% group_keys() - .group_ids <- dplyr::group_indices(.x)-1 - } else { # create a dummy pop - .group_levels = tibble(population="pop") - .group_ids <- rep(0,nrow(.x)) - } - - # summarise population frequencies - pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, - rowInd = .gt_bigsnp_rows(.x), - colInd = .gt_bigsnp_cols(.x), - groupIds = .group_ids, - ngroups = nrow(.group_levels), - ncores = n_cores) - - # get the number of individuals - n <-pop_freqs_df$n/2 - # is this correct? - sHo <-pop_freqs_df$het_obs - mHo <- apply(sHo, 1, mean, na.rm = TRUE) - - # sum of squared frequencies - sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2 - Hs <- (1 - sp2 - sHo/2/n) - Hs <- n/(n - 1) * Hs - Fis = 1 - sHo/Hs - - np <- apply(n, 1, fun <- function(x) sum(!is.na(x))) - # mean sample size over the populations - mn <- apply(n, 1, fun <- function(x) { - sum(!is.na(x))/sum(1/x[!is.na(x)]) - }) - # mean sum of square frequencies - msp2 <- apply(sp2, 1, mean, na.rm = TRUE) - mp2 <- rowMeans(pop_freqs_df$freq_alt)^2+rowMeans(pop_freqs_df$freq_ref)^2 - mHs <- mn/(mn - 1) * (1 - msp2 - mHo/2/mn) - Ht <- 1 - mp2 + mHs/mn/np - mHo/2/mn/np - mFis = 1 - mHo/mHs - - Dst <- Ht - mHs - Dstp <- np/(np - 1) * Dst - Htp = mHs + Dstp - Fst = Dst/Ht - Fstp = Dstp/Htp - Dest <- Dstp/(1 - mHs) - res <- data.frame(cbind(mHo, mHs, Ht, Dst, Htp, Dstp, Fst, - Fstp, mFis, Dest)) - names(res) <- c("Ho", "Hs", "Ht", "Dst", "Htp", "Dstp", - "Fst", "Fstp", "Fis", "Dest") - # overall summaries - is.na(res) <- do.call(cbind, lapply(res, is.infinite)) - overall <- apply(res, 2, mean, na.rm = TRUE) - overall[7] <- overall[4]/overall[3] - overall[8] <- overall[6]/overall[5] - overall[9] <- 1 - overall[1]/overall[2] - overall[10] <- overall[6]/(1 - overall[2]) - names(overall) <- names(res) - - digits <- 4 - all.res <- list(n.ind.samp = n, Ho = round(sHo, digits), Hs = round(Hs, digits), - Fis = round(Fis, digits), perloc = round(res, digits), - overall = round(overall, digits)) - class(all.res) <- "basic.stats" - all.res - - - -} - -# loci_het_obs -# loci_het_exp -# loci_gene_div -# loci_dst -# loci_dst_prime -# loci_het_tot -# loci_het_top_prime -# loci_fst -# loci_fst_prime -# loci_fis -# loci_jost_d - -#pop_Ho -#pop_Hs -#pop_Fis - - - diff --git a/man/hierfstat_basic_stats.Rd b/man/hierfstat_basic_stats.Rd deleted file mode 100644 index 888262d8..00000000 --- a/man/hierfstat_basic_stats.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hierfstat_basic_stats.R -\name{hierfstat_basic_stats} -\alias{hierfstat_basic_stats} -\title{Compute basic population statistics} -\usage{ -hierfstat_basic_stats(.x, n_cores = bigstatsr::nb_cores()) -} -\arguments{ -\item{.x}{a \code{\link{gen_tibble}} (usually grouped, as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}}; use on -a single population will return a number of quantities as NA/NaN)} - -\item{n_cores}{number of cores to be used, it defaults to \code{\link[bigstatsr:reexports]{bigstatsr::nb_cores()}}} -} -\value{ -a tibble of population statistics, with populations as rows and statistics as columns -} -\description{ -This function computes basic population statistics, following the formulas used in \code{hietfstat}. -} From 7180dcb0c94f1f96a50af30eac54a3045893b06e Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Fri, 18 Oct 2024 20:41:11 +0100 Subject: [PATCH 12/20] bump version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3ec31e5c..48f53d78 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tidypopgen Title: Tidy Population Genetics -Version: 0.0.0.9017 +Version: 0.0.0.9018 Authors@R: c(person("Evie", "Carter", role = c("aut")), person("Andrea", "Manica", email = "am315@cam.ac.uk", From d5947da0b471bb345a8d5904302f25f6dce17967 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Fri, 18 Oct 2024 21:30:34 +0100 Subject: [PATCH 13/20] add some tests --- tests/testthat/test_pop_basic_stats.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/testthat/test_pop_basic_stats.R b/tests/testthat/test_pop_basic_stats.R index 00bb169f..78af26a1 100644 --- a/tests/testthat/test_pop_basic_stats.R +++ b/tests/testthat/test_pop_basic_stats.R @@ -70,5 +70,19 @@ test_that("pop_* basic stats functions work correctly",{ # test Fis errors for incorrect parameters for WG17 expect_error(test_gt %>% pop_fis(by_locus = TRUE, method= "WG17", allele_sharing_mat = matrix(1, nrow=7, ncol=7)), "by_locus not implemented for WG17") + # test alias for pop_het_exp + expect_identical(test_het_exp, test_gt %>% pop_gene_div(by_locus = FALSE, include_global = TRUE)) + +}) + +test_that("pop_* basic stats work on a single population",{ + test_het_obs <- test_gt %>% pop_het_obs(by_locus = TRUE) + expect_true(ncol(test_het_obs) == 1) # just one column, as we only have one population + test_fis <- test_gt %>% pop_fis(by_locus = TRUE) + expect_true(ncol(test_fis) == 1) # just one column, as we only have one population + test_het_exp <- test_gt %>% pop_het_exp(by_locus = TRUE) + expect_true(ncol(test_het_exp) == 1) # just one column, as we only have one population + test_global_stats <- test_gt %>% pop_global_stats(by_locus = TRUE) + expect_true(all(is.nan(test_global_stats$Fstp))) # Fstp is not defined for a single population }) From b760efb013afda97eaafe2f612a904557b3e0f4c Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Fri, 18 Oct 2024 21:46:37 +0100 Subject: [PATCH 14/20] clean up --- R/gt_helper_functions.R | 20 -------------------- R/gt_roh_window.R | 4 ++-- man/gt_roh_window.Rd | 4 ++-- 3 files changed, 4 insertions(+), 24 deletions(-) diff --git a/R/gt_helper_functions.R b/R/gt_helper_functions.R index bc58f58c..aa0bc217 100644 --- a/R/gt_helper_functions.R +++ b/R/gt_helper_functions.R @@ -10,23 +10,3 @@ .gt_get_bigsnp<-function(.x){ attr(.x$genotypes,"bigsnp") } - - -# a developer function to create various count summaries of a population, used to -# compute more complex statistics (e.g. pairwise fst, etc.). -.gt_pop_freqs <- function(.x){ - counts <- bigstatsr::big_counts( .gt_get_bigsnp(.x)$genotypes, - ind.row =.gt_bigsnp_rows(.x), - ind.col = .gt_bigsnp_cols(.x)) - sums_alt <- apply(counts,2,function(x) x[2]+2*x[3]) - n <- apply(counts,2,function(x) sum(x[1:3])*2) - sums_ref <- n - sums_alt - freq_alt <- sums_alt/n - freq_ref <- 1- freq_alt - het_obs <- apply(counts,2,function(x) x[2]/sum(x[1:3])) - return (list( - freq_alt = freq_alt, - freq_ref = freq_ref, - n = n, - het_obs = het_obs)) -} diff --git a/R/gt_roh_window.R b/R/gt_roh_window.R index fc4d3d12..ae38bbb1 100644 --- a/R/gt_roh_window.R +++ b/R/gt_roh_window.R @@ -42,8 +42,8 @@ #' @export #' #' @examples -#' # don't run the example -#' if (FALSE) { +#' # run the example only if we have the package installed +#' if (requireNamespace("detectRUNS", quietly = TRUE)) { #' sheep_ped <- system.file("extdata", "Kijas2016_Sheep_subset.ped", #' package="detectRUNS") #' sheep_gt <- tidypopgen::gen_tibble(sheep_ped, backingfile = tempfile(), diff --git a/man/gt_roh_window.Rd b/man/gt_roh_window.Rd index a569ba0b..441d8801 100644 --- a/man/gt_roh_window.Rd +++ b/man/gt_roh_window.Rd @@ -74,8 +74,8 @@ group table. Otherwise, the group 'column' is filled with the same values as the column } \examples{ -# don't run the example -if (FALSE) { +# run the example only if we have the package installed +if (requireNamespace("detectRUNS", quietly = TRUE)) { sheep_ped <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package="detectRUNS") sheep_gt <- tidypopgen::gen_tibble(sheep_ped, backingfile = tempfile(), From cc2b63317aef5ae9c1ae5450fabb018d81e881a4 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Fri, 18 Oct 2024 21:48:37 +0100 Subject: [PATCH 15/20] Add code coverage [skip ci] --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index bb11a444..bf8e1727 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![R-CMD-check](https://github.com/EvolEcolGroup/tidypopgen/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/EvolEcolGroup/tidypopgen/actions/workflows/R-CMD-check.yaml) +[![codecov](https://codecov.io/gh/EvolEcolGroup/tidypopgen/branch/main/graph/badge.svg?token=KLOzxJoLBO)](https://app.codecov.io/gh/EvolEcolGroup/tidypopgen) The goal of `tidypopgen` is to provide a tidy grammar of population genetics, facilitating From 08fd106f72e9f89944611370f290d076435a7a04 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Fri, 18 Oct 2024 22:09:57 +0100 Subject: [PATCH 16/20] A few more tests --- R/pop_fis.R | 2 +- R/pop_fst.R | 2 +- tests/testthat/test_pop_fis_fst.R | 10 ++++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/R/pop_fis.R b/R/pop_fis.R index deb49cd7..90275325 100644 --- a/R/pop_fis.R +++ b/R/pop_fis.R @@ -83,7 +83,7 @@ pop_fis_nei87 <- function(.x, by_locus = FALSE, include_global=include_global, pop_fis_wg17 <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){ if (!inherits(.x,"grouped_df")){ - stop (".x should be a grouped df") + stop (".x should be a grouped gen_tibble") } if (is.null(allele_sharing_mat)){ allele_sharing_mat <- pairwise_allele_sharing(.x, as_matrix = TRUE) diff --git a/R/pop_fst.R b/R/pop_fst.R index 61e9c844..34758a8e 100644 --- a/R/pop_fst.R +++ b/R/pop_fst.R @@ -14,7 +14,7 @@ pop_fst <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){ if (!inherits(.x,"grouped_df")){ - stop (".x should be a grouped df") + stop (".x should be a grouped gen_tibble") } if (is.null(allele_sharing_mat)){ allele_sharing_mat <- pairwise_allele_sharing(.x, as_matrix = TRUE) diff --git a/tests/testthat/test_pop_fis_fst.R b/tests/testthat/test_pop_fis_fst.R index 142b088d..48b910f9 100644 --- a/tests/testthat/test_pop_fis_fst.R +++ b/tests/testthat/test_pop_fis_fst.R @@ -18,6 +18,14 @@ test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_in test_that("pop_fst and pop_fist WG17 compute correctly",{ + # expect error if the tibble is not grouped + expect_error(test_gt %>% pop_fis(method="WG17"), + ".x should be a grouped gen_tibble") + + expect_error(test_gt %>% pop_fst(), + ".x should be a grouped gen_tibble") + + # group the tibble test_gt <- test_gt %>% dplyr::group_by(population) # compare results against raw hierfstat code @@ -28,8 +36,6 @@ test_that("pop_fst and pop_fist WG17 compute correctly",{ fis_by_pop_sub <- test_gt %>% pop_fis(method = "WG17") expect_true (all.equal(fis_by_pop[-length(fis_by_pop)], fis_by_pop_sub)) - - # compare results against raw hierfstat code fst_by_pop <- test_gt %>% pop_fst(include_global=TRUE) fst_by_pop_hier <- hierfstat::fst.dosage(test_genotypes,pop=test_indiv_meta$population) From 6103d52f810b0c77ed655483d56c4aff3c3850e3 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Sat, 19 Oct 2024 10:01:37 +0100 Subject: [PATCH 17/20] fix tests --- tests/testthat/test_gen_tibble.R | 2 +- tests/testthat/test_q_matrix.R | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R index baa5c3e7..d6af24cd 100644 --- a/tests/testthat/test_gen_tibble.R +++ b/tests/testthat/test_gen_tibble.R @@ -549,7 +549,7 @@ test_that("gt without population is valid",{ gt_load(file_names[1]) # will fail in functions that require grouping - expect_error(pop_fst(test_gt), ".x should be a grouped df") + expect_error(pop_fst(test_gt), ".x should be a grouped gen_tibble") # functions that require grouping work after adding groups test_gt$groups <- c("A","A","B") test_gt <- test_gt %>% group_by(groups) diff --git a/tests/testthat/test_q_matrix.R b/tests/testthat/test_q_matrix.R index d6f5c273..3f29dc56 100644 --- a/tests/testthat/test_q_matrix.R +++ b/tests/testthat/test_q_matrix.R @@ -15,9 +15,7 @@ test_that("q_matrix reads q_files",{ expect_true(inherits(anolis_q_k3_mat,"q_matrix")) #read multiple .Q - package_path <- find.package("tidypopgen") - folder <- "/extdata/anolis" - q_folder <- file.path(package_path, folder) + q_folder <- system.file("/extdata/anolis", package = "tidypopgen") anolis_q <- q_matrix(q_folder) expect_true(inherits(anolis_q,"q_matrix_list")) expect_true(inherits(anolis_q[[1]][[1]],"q_matrix")) From f4fb7abc133ad0d1ad5eccfd5265940a9a3934c7 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Sat, 19 Oct 2024 10:21:14 +0100 Subject: [PATCH 18/20] don't fix number of cores when reading vcf --- R/gen_tibble.R | 1 + R/vcf_to_fbm_cpp.R | 8 ++++---- man/gen_tibble.Rd | 2 ++ man/vcf_to_fbm_cpp.Rd | 14 +++++++++++--- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/R/gen_tibble.R b/R/gen_tibble.R index b7efe37b..16a752db 100644 --- a/R/gen_tibble.R +++ b/R/gen_tibble.R @@ -37,6 +37,7 @@ #' a fast C++ parser, or "vcfR" to use the R package `vcfR`. The latter is slower #' but more robust; if "cpp" gives error, try using "vcfR" in case your VCF has #' an unusual structure. +#' @param n_cores the number of cores to use for parallel processing #' @param valid_alleles a vector of valid allele values; it defaults to 'A','T', #' 'C' and 'G'. #' @param missing_alleles a vector of values in the BIM file/loci dataframe that diff --git a/R/vcf_to_fbm_cpp.R b/R/vcf_to_fbm_cpp.R index cdf1f60f..c1ee1167 100644 --- a/R/vcf_to_fbm_cpp.R +++ b/R/vcf_to_fbm_cpp.R @@ -4,8 +4,9 @@ #' This should work even for large vcf files that would not fit in memory. #' #' @param vcf_path the path to the vcf -#' @param chunk_size the chunk size to use on the vcf when loading the file -#' @param backingfile the name of the file to use as the backing file +#' @param chunk_size the chunk size to use on the vcf when loading the file. If NULL, a best guess will be made. +#' @param backingfile the name of the file to use as the backing file for the FBM. If NULL, the vcf path will be used. +#' @param n_cores the number of cores to use when reading the vcf file. Default is 1. #' @return path to the resulting rds file as class bigSNP. #' @keywords internal @@ -13,10 +14,9 @@ vcf_to_fbm_cpp <- function( vcf_path, chunk_size = NULL, backingfile = NULL, + n_cores = 1, quiet=FALSE) { - n_cores <- 3 - if (is.null(backingfile)){ backingfile <- vcf_path backingfile <- sub("\\.vcf.gz$", "", backingfile) diff --git a/man/gen_tibble.Rd b/man/gen_tibble.Rd index b86be381..9607151a 100644 --- a/man/gen_tibble.Rd +++ b/man/gen_tibble.Rd @@ -101,6 +101,8 @@ the files.} a vector of values for mixed ploidy). Only used if creating a gen_tibble from a matrix of data; otherwise, ploidy is determined automatically from the data as they are read.} + +\item{n_cores}{the number of cores to use for parallel processing} } \value{ an object of the class \code{gen_tbl}. diff --git a/man/vcf_to_fbm_cpp.Rd b/man/vcf_to_fbm_cpp.Rd index 3e5878a4..ee02aeb9 100644 --- a/man/vcf_to_fbm_cpp.Rd +++ b/man/vcf_to_fbm_cpp.Rd @@ -4,14 +4,22 @@ \alias{vcf_to_fbm_cpp} \title{Convert vcf to FBM.} \usage{ -vcf_to_fbm_cpp(vcf_path, chunk_size = NULL, backingfile = NULL, quiet = FALSE) +vcf_to_fbm_cpp( + vcf_path, + chunk_size = NULL, + backingfile = NULL, + n_cores = 1, + quiet = FALSE +) } \arguments{ \item{vcf_path}{the path to the vcf} -\item{chunk_size}{the chunk size to use on the vcf when loading the file} +\item{chunk_size}{the chunk size to use on the vcf when loading the file. If NULL, a best guess will be made.} -\item{backingfile}{the name of the file to use as the backing file} +\item{backingfile}{the name of the file to use as the backing file for the FBM. If NULL, the vcf path will be used.} + +\item{n_cores}{the number of cores to use when reading the vcf file. Default is 1.} } \value{ path to the resulting rds file as class bigSNP. From 71f2f3e225255a8bc1c10b7c37ecc19826c68337 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Sat, 19 Oct 2024 10:31:36 +0100 Subject: [PATCH 19/20] fix docs issue --- R/gen_tibble.R | 2 ++ man/gen_tibble.Rd | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/R/gen_tibble.R b/R/gen_tibble.R index 16a752db..8c276811 100644 --- a/R/gen_tibble.R +++ b/R/gen_tibble.R @@ -77,6 +77,7 @@ gen_tibble.character <- function(x, ..., parser = c("vcfR","cpp"), + n_cores = 1, chunk_size = NULL, valid_alleles = c("A", "T", "C", "G"), missing_alleles = c("0","."), @@ -108,6 +109,7 @@ gen_tibble.character <- quiet = quiet) } else if ((tolower(file_ext(x))=="vcf") || (tolower(file_ext(x))=="gz")){ return(gen_tibble_vcf(x = x, ..., parser = parser, chunk_size = chunk_size, + n_cores = n_cores, valid_alleles= valid_alleles, missing_alleles= missing_alleles, backingfile = backingfile, quiet = quiet)) diff --git a/man/gen_tibble.Rd b/man/gen_tibble.Rd index 9607151a..b1dd9fd5 100644 --- a/man/gen_tibble.Rd +++ b/man/gen_tibble.Rd @@ -19,6 +19,7 @@ gen_tibble( x, ..., parser = c("vcfR", "cpp"), + n_cores = 1, chunk_size = NULL, valid_alleles = c("A", "T", "C", "G"), missing_alleles = c("0", "."), @@ -83,6 +84,8 @@ a fast C++ parser, or "vcfR" to use the R package \code{vcfR}. The latter is slo but more robust; if "cpp" gives error, try using "vcfR" in case your VCF has an unusual structure.} +\item{n_cores}{the number of cores to use for parallel processing} + \item{chunk_size}{the number of loci or individuals (depending on the format) processed at a time (currently used if \code{x} is a vcf or packedancestry file)} @@ -101,8 +104,6 @@ the files.} a vector of values for mixed ploidy). Only used if creating a gen_tibble from a matrix of data; otherwise, ploidy is determined automatically from the data as they are read.} - -\item{n_cores}{the number of cores to use for parallel processing} } \value{ an object of the class \code{gen_tbl}. From c71fcf3227beb5732a43635a22690d556c5d9855 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Sat, 19 Oct 2024 10:47:22 +0100 Subject: [PATCH 20/20] additional vcf tests --- tests/testthat/test_gen_tibble.R | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R index d6af24cd..51c55148 100644 --- a/tests/testthat/test_gen_tibble.R +++ b/tests/testthat/test_gen_tibble.R @@ -190,7 +190,7 @@ test_that("gen_tibble from files",{ expect_true(all.equal(show_genotypes(pop_a_gt),show_genotypes(pop_a_vcf_gt))) # reload it in chunks pop_a_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), - chunk_size = 2) + chunk_size = 2, parser="vcfR") expect_true(all.equal(show_genotypes(pop_a_vcf_gt2),show_genotypes(pop_a_vcf_gt))) expect_true(all.equal(show_loci(pop_a_vcf_gt2),show_loci(pop_a_vcf_gt))) expect_true(is.integer(show_loci(pop_a_vcf_gt)$chr_int)) @@ -235,11 +235,12 @@ test_that("gen_tibble from files with missingness",{ # PLINK VCF files ######################## vcf_path <- system.file("extdata/pop_b.vcf", package = "tidypopgen") - pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile()) + pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), + parser="vcfR") expect_true(all.equal(show_genotypes(pop_b_gt),show_genotypes(pop_b_vcf_gt))) # reload it in chunks pop_b_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), - chunk_size = 2) + chunk_size = 2, parser = "vcfR") expect_true(all.equal(show_genotypes(pop_b_vcf_gt2),show_genotypes(pop_b_vcf_gt))) expect_true(all.equal(show_loci(pop_b_vcf_gt2),show_loci(pop_b_vcf_gt))) expect_true(is.integer(show_loci(pop_b_vcf_gt2)$chr_int)) @@ -562,6 +563,21 @@ test_that("gt without population is valid",{ }) +test_that("additional vcf tests with larger file",{ + vcf_path <- system.file("/extdata/anolis/punctatus_t70_s10_n46_filtered.recode.vcf.gz", + package = "tidypopgen") + anole_gt <- gen_tibble(vcf_path, quiet = TRUE, parser = "cpp", backingfile = tempfile("anolis_")) + anole_gt_vcfr <- gen_tibble(vcf_path, quiet = TRUE, parser = "vcfR", + backingfile = tempfile("anolis_")) + expect_true(all.equal(show_genotypes(anole_gt),show_genotypes(anole_gt_vcfr))) + anole_gt2 <- gen_tibble(vcf_path, quiet = TRUE, parser = "cpp", backingfile = tempfile("anolis_"), + chunk_size = 1000, n_cores = 2) + expect_true(all.equal(show_genotypes(anole_gt2),show_genotypes(anole_gt_vcfr))) +} +) + + + # Windows prevents the deletion of the backing file. It's something to do with the memory mapping # library used by bigsnpr # test_that("on error, we remove the old files",{