diff --git a/DESCRIPTION b/DESCRIPTION index 3ec31e5c..48f53d78 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tidypopgen Title: Tidy Population Genetics -Version: 0.0.0.9017 +Version: 0.0.0.9018 Authors@R: c(person("Evie", "Carter", role = c("aut")), person("Andrea", "Manica", email = "am315@cam.ac.uk", diff --git a/NAMESPACE b/NAMESPACE index 9089b068..24f96000 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,13 +24,10 @@ S3method(dplyr_row_slice,grouped_gen_tbl) S3method(gen_tibble,character) S3method(gen_tibble,matrix) S3method(group_by,gen_tbl) -S3method(indiv_het_obs,grouped_df) S3method(indiv_het_obs,tbl_df) S3method(indiv_het_obs,vctrs_bigSNP) -S3method(indiv_missingness,grouped_df) S3method(indiv_missingness,tbl_df) S3method(indiv_missingness,vctrs_bigSNP) -S3method(indiv_ploidy,grouped_df) S3method(indiv_ploidy,tbl_df) S3method(indiv_ploidy,vctrs_bigSNP) S3method(loci_alt_freq,grouped_df) @@ -124,6 +121,10 @@ export(pairwise_king) export(pairwise_pop_fst) export(pop_fis) export(pop_fst) +export(pop_gene_div) +export(pop_global_stats) +export(pop_het_exp) +export(pop_het_obs) export(q_matrix) export(qc_report_indiv) export(qc_report_loci) diff --git a/R/gen_tibble.R b/R/gen_tibble.R index b7efe37b..8c276811 100644 --- a/R/gen_tibble.R +++ b/R/gen_tibble.R @@ -37,6 +37,7 @@ #' a fast C++ parser, or "vcfR" to use the R package `vcfR`. The latter is slower #' but more robust; if "cpp" gives error, try using "vcfR" in case your VCF has #' an unusual structure. +#' @param n_cores the number of cores to use for parallel processing #' @param valid_alleles a vector of valid allele values; it defaults to 'A','T', #' 'C' and 'G'. #' @param missing_alleles a vector of values in the BIM file/loci dataframe that @@ -76,6 +77,7 @@ gen_tibble.character <- function(x, ..., parser = c("vcfR","cpp"), + n_cores = 1, chunk_size = NULL, valid_alleles = c("A", "T", "C", "G"), missing_alleles = c("0","."), @@ -107,6 +109,7 @@ gen_tibble.character <- quiet = quiet) } else if ((tolower(file_ext(x))=="vcf") || (tolower(file_ext(x))=="gz")){ return(gen_tibble_vcf(x = x, ..., parser = parser, chunk_size = chunk_size, + n_cores = n_cores, valid_alleles= valid_alleles, missing_alleles= missing_alleles, backingfile = backingfile, quiet = quiet)) diff --git a/R/gt_helper_functions.R b/R/gt_helper_functions.R index bc58f58c..aa0bc217 100644 --- a/R/gt_helper_functions.R +++ b/R/gt_helper_functions.R @@ -10,23 +10,3 @@ .gt_get_bigsnp<-function(.x){ attr(.x$genotypes,"bigsnp") } - - -# a developer function to create various count summaries of a population, used to -# compute more complex statistics (e.g. pairwise fst, etc.). -.gt_pop_freqs <- function(.x){ - counts <- bigstatsr::big_counts( .gt_get_bigsnp(.x)$genotypes, - ind.row =.gt_bigsnp_rows(.x), - ind.col = .gt_bigsnp_cols(.x)) - sums_alt <- apply(counts,2,function(x) x[2]+2*x[3]) - n <- apply(counts,2,function(x) sum(x[1:3])*2) - sums_ref <- n - sums_alt - freq_alt <- sums_alt/n - freq_ref <- 1- freq_alt - het_obs <- apply(counts,2,function(x) x[2]/sum(x[1:3])) - return (list( - freq_alt = freq_alt, - freq_ref = freq_ref, - n = n, - het_obs = het_obs)) -} diff --git a/R/gt_roh_window.R b/R/gt_roh_window.R index fc4d3d12..ae38bbb1 100644 --- a/R/gt_roh_window.R +++ b/R/gt_roh_window.R @@ -42,8 +42,8 @@ #' @export #' #' @examples -#' # don't run the example -#' if (FALSE) { +#' # run the example only if we have the package installed +#' if (requireNamespace("detectRUNS", quietly = TRUE)) { #' sheep_ped <- system.file("extdata", "Kijas2016_Sheep_subset.ped", #' package="detectRUNS") #' sheep_gt <- tidypopgen::gen_tibble(sheep_ped, backingfile = tempfile(), diff --git a/R/indiv_het_obs.R b/R/indiv_het_obs.R index f8a11932..ac4522e0 100644 --- a/R/indiv_het_obs.R +++ b/R/indiv_het_obs.R @@ -48,8 +48,8 @@ indiv_het_obs.vctrs_bigSNP <- function(.x, ...){ this_col_1_na[1,]/(ncol(X)-this_col_1_na[2,]) } -#' @export -#' @rdname indiv_het_obs -indiv_het_obs.grouped_df <- function(.x, ...){ - .x %>% mutate(indiv_het_obs = indiv_het_obs(.data$genotypes)) %>% summarise(het_obs = mean(indiv_het_obs)) -} +# #' @export +# #' @rdname indiv_het_obs +# indiv_het_obs.grouped_df <- function(.x, ...){ +# .x %>% mutate(indiv_het_obs = indiv_het_obs(.data$genotypes)) %>% summarise(het_obs = mean(indiv_het_obs)) +# } diff --git a/R/indiv_missingness.R b/R/indiv_missingness.R index 91ec948b..a3c6e959 100644 --- a/R/indiv_missingness.R +++ b/R/indiv_missingness.R @@ -42,9 +42,9 @@ indiv_missingness.vctrs_bigSNP <- function(.x, as_counts = FALSE, ...){ row_na } -#' @export -#' @rdname indiv_missingness -indiv_missingness.grouped_df <- function(.x, as_counts = FALSE, ...){ - .x %>% mutate(indiv_missingness = indiv_missingness(.data$genotypes, as_counts = as_counts)) %>% - summarise(het_obs = mean(indiv_missingness)) -} +#' #' @export +#' #' @rdname indiv_missingness +#' indiv_missingness.grouped_df <- function(.x, as_counts = FALSE, ...){ +#' .x %>% mutate(indiv_missingness = indiv_missingness(.data$genotypes, as_counts = as_counts)) %>% +#' summarise(het_obs = mean(indiv_missingness)) +#' } diff --git a/R/indiv_ploidy.R b/R/indiv_ploidy.R index ccedd88c..fc92fab7 100644 --- a/R/indiv_ploidy.R +++ b/R/indiv_ploidy.R @@ -31,9 +31,9 @@ indiv_ploidy.vctrs_bigSNP <- function(.x, ...){ } } -#' @export -#' @rdname indiv_ploidy -indiv_ploidy.grouped_df <- function(.x, ...){ - .x %>% mutate(indiv_ploidy = indiv_ploidy(.data$genotypes)) %>% - summarise(ploidy = mean(indiv_ploidy)) -} +#' #' @export +#' #' @rdname indiv_ploidy +#' indiv_ploidy.grouped_df <- function(.x, ...){ +#' .x %>% mutate(indiv_ploidy = indiv_ploidy(.data$genotypes)) %>% +#' summarise(ploidy = mean(indiv_ploidy)) +#' } diff --git a/R/pop_fis.R b/R/pop_fis.R index 268dcc9a..90275325 100644 --- a/R/pop_fis.R +++ b/R/pop_fis.R @@ -1,19 +1,89 @@ #' Compute population specific FIS #' -#' This function computes population specific FIS (as computed by [hierfstat::fis.dosage()]). +#' This function computes population specific FIS, using either the approach of Nei 1987 (as computed by [hierfstat::basic.stats()]) or of Weir and Goudet 2017 +#' (as computed by [hierfstat::fis.dosage()]). +#' @references +#' Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press +#' Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085 #' @param .x a grouped [`gen_tibble`] (as obtained by using [dplyr::group_by()]) -#' @param include_global boolean determining whether, besides the population specific fis, a global -#' fis should be appended. Note that this will return a vector of n populations plus 1 (the global value) -#' @param allele_sharing_mat optional, the matrix of Allele Sharing returned by +#' @param method one of "Nei87" (based on Nei 1987, eqn 7.41) or "WG17" (for Weir and Goudet 2017) to compute FIS +#' @param by_locus boolean, determining whether FIS should be returned by locus(TRUE), +#' or as a single genome wide value (FALSE, the default). Note that this is only relevant for "Nei87", +#' as "WG17" always returns a single value. +#' @param include_global boolean determining whether, besides the population specific estiamtes, a global +#' estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value), +#' or a matrix with n+1 columns if `by_locus=TRUE`. +#' @param allele_sharing_mat optional and only relevant for "WG17", the matrix of Allele Sharing returned by #' [pairwise_allele_sharing()] with `as_matrix=TRUE`. As a number of statistics can be #' derived from the Allele Sharing matrix, #' it it sometimes more efficient to pre-compute this matrix. #' @returns a vector of population specific fis (plus the global value if `include_global=TRUE`) #' @export -pop_fis <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){ +pop_fis <- function(.x, method = c("Nei87", "WG17"), by_locus = FALSE, include_global=FALSE, allele_sharing_mat = NULL){ + method <- match.arg(method) + if (method == "Nei87"){ + if (!is.null(allele_sharing_mat)){ + stop("allele_sharing_mat not relevant for Nei87") + } + pop_fis_nei87(.x, by_locus = by_locus, include_global = include_global) + } else if (method == "WG17"){ + if (by_locus){ + stop("by_locus not implemented for WG17") + } + pop_fis_wg17(.x, include_global=include_global, allele_sharing_mat = allele_sharing_mat) + } +} + +pop_fis_nei87 <- function(.x, by_locus = FALSE, include_global=include_global, + n_cores = bigstatsr::nb_cores()){ + stopifnot_diploid(.x) + # get the populations if it is a grouped gen_tibble + if (inherits(.x,"grouped_df")){ + .group_levels <- .x %>% group_keys() + .group_ids <- dplyr::group_indices(.x)-1 + } else { # create a dummy pop + .group_levels = tibble(population="pop") + .group_ids <- rep(0,nrow(.x)) + } + + # summarise population frequencies + pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, + rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = .group_ids, + ngroups = nrow(.group_levels), + ncores = n_cores) + sHo <-pop_freqs_df$het_obs + mHo <- apply(sHo, 1, mean, na.rm = TRUE) + n <-pop_freqs_df$n/2 + # sum of squared frequencies + sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2 + Hs <- (1 - sp2 - sHo/2/n) + Hs <- n/(n - 1) * Hs + Fis = 1 - sHo/Hs + + colnames(Fis)<- .group_levels %>% dplyr::pull(1) + if (by_locus){ + if (include_global){ + global <- (.x %>% pop_global_stats(by_locus = TRUE, n_cores = n_cores))$Fis + Fis <- cbind(Fis, global) + } + } else { + Fis <- colMeans(Fis, na.rm = TRUE) + if (include_global){ + global <- (.x %>% pop_global_stats(by_locus = FALSE, n_cores = n_cores))["Fis"] + names(global) <- "global" + Fis <- c(Fis, global) + } + } + return(Fis) +} + + +pop_fis_wg17 <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){ if (!inherits(.x,"grouped_df")){ - stop (".x should be a grouped df") + stop (".x should be a grouped gen_tibble") } if (is.null(allele_sharing_mat)){ allele_sharing_mat <- pairwise_allele_sharing(.x, as_matrix = TRUE) diff --git a/R/pop_fst.R b/R/pop_fst.R index 99e84280..34758a8e 100644 --- a/R/pop_fst.R +++ b/R/pop_fst.R @@ -1,6 +1,8 @@ #' Compute population specific Fst #' -#' This function computes population specific Fst (as computed by [hierfstat::fst.dosage()]). +#' This function computes population specific Fst, using the approach in Weir and Goudet 2017 +#' (as computed by [hierfstat::fst.dosage()]). +#' @references Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085 #' @param .x a grouped [`gen_tibble`] (as obtained by using [dplyr::group_by()]) #' @param include_global boolean determining whether, besides the population specific Fst, a global #' Fst should be appended. Note that this will return a vector of n populations plus 1 (the global value) @@ -12,7 +14,7 @@ pop_fst <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){ if (!inherits(.x,"grouped_df")){ - stop (".x should be a grouped df") + stop (".x should be a grouped gen_tibble") } if (is.null(allele_sharing_mat)){ allele_sharing_mat <- pairwise_allele_sharing(.x, as_matrix = TRUE) diff --git a/R/pop_global_stats.R b/R/pop_global_stats.R new file mode 100644 index 00000000..9941c0a5 --- /dev/null +++ b/R/pop_global_stats.R @@ -0,0 +1,150 @@ +#' Compute basic population global statistics +#' +#' This function computes basic population global statistics, following the notation in Nei 1987 (which in turn is based on Nei and Chesser 1983): +#' - observed heterozygosity ( \eqn{\hat{h}_o}, column header `Ho`) +#' - expected heterozygosity, also known as gene diversity ( \eqn{\hat{h}_s}, `Hs`) +#' - total heterozygosity ( \eqn{\hat{h}_t}, `Ht`) +#' - genetic differentiation between subpopulations (\eqn{D_{st}}, `Dst`) +#' - corrected total population diversity (\eqn{h'_t}, `Htp`) +#' - corrected genetic differentiation between subpopulations (\eqn{D'_{st}}, `Dstp`) +#' - \eqn{\hat{F}_{ST}} (column header, `Fst`) +#' - corrected \eqn{\hat{F'}_{ST}} (column header `Fstp`) +#' - \eqn{\hat{F}_{IS}} (column header, `Fis`) +#' - Jost's \eqn{\hat{D}} (column header, `Dest`) +#' +#' @details We use the notation of Nei 1987. That notation was for loci with \eqn{m} alleles, but in our case we only have two alleles, so `m=2`. +#' - Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr +#' \eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m} \hat{X}_{kii}/s}\cr +#' where\cr +#' \eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr +#' \eqn{s} the number of populations,\cr +#' following equation 7.38 in Nei(1987) on pp.164.\cr +#' +#' - Within population expected heterozygosity (gene diversity) \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr +#' \eqn{\hat{h}_s=(\tilde{n}/(\tilde{n}-1))[1-\sum_{i=1}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr +#' where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr +#' \eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr +#' following equation 7.39 in Nei(1987) on pp.164. +#' +#' - Total heterozygosity (total gene diversity) \eqn{\hat{h}_t} for a locus with \eqn{m} alleles is defined as:\cr +#' \eqn{\hat{h}_t = 1-\sum_{i=1}^{m} \bar{\hat{x}_i^2} + \hat{h}_s/(\tilde{n}s) - \hat{h}_o/(2\tilde{n}s)}\cr +#' where \cr +#' \eqn{\hat{x}_i=\sum_k \hat{x}_{ki}/s}\cr +#' following equation 7.40 in Nei(1987) on pp.164.\cr +#' +#' - The amount of gene diversity among samples \eqn{D_{ST}} is defined as:\cr +#' \eqn{D_{ST} = \hat{h}_t - \hat{h}_s}\cr +#' following the equation provided in the text at the top of page 165 in Nei(1987). +#' +#' - The corrected amount of gene diversity among samples \eqn{D'_{ST}} is defined as:\cr +#' \eqn{D'_{ST} = (s/(s-1))D'_{ST}}\cr +#' following the equation provided in the text at the top of page 165 in Nei(1987). +#' +#' - Total corrected heterozygosity (total gene diversity) \eqn{\hat{h}_t} is defined as:\cr +#' \eqn{\hat{h'}_t = \hat{h}_s + D'_{ST}}\cr +#' following the equation provided in the text at the top of page 165 in Nei(1987). +#' - \eqn{\hat{F}_{IS}} is defined as:\cr +#' \eqn{\hat{F}_{IS} = 1 - \hat{h}_o/\hat{h}_s}\cr +#' following equation 7.41 in Nei(1987) on pp.164.\cr +#' +#' - \eqn{\hat{F}_{ST}} is defined as:\cr +#' \eqn{\hat{F}_{ST} = 1 - \hat{h}_s/\hat{h}_t = D_{ST}/\hat{h}_t}\cr +#' following equation 7.43 in Nei(1987) on pp.165.\cr +#' +#' - \eqn{\hat{F'}_{ST}} is defined as:\cr +#' \eqn{\hat{F'}_{ST} = D'_{ST}/\hat{h'}_t}\cr +#' following the explanation provided in the text at the top of page 165 in Nei(1987). +#' +#' - Jost's \eqn{\hat{D}} is defined as:\cr +#' \eqn{\hat{D} = (s/(s-1))((\hat{h'}_t-\hat{h}_s)/(1-\hat{h}_s))}\cr +#' as defined by Jost(2008) +#' +#' All these statistics are first computed by locus, and then averaged across loci (including any +#' monorphic locus) to obtain genome-wide values. The function uses the same algorithm as +#' [hierfstat::basic.stats()] but is optimized for speed and memory usage. +#' +#' @references Nei M, Chesser R (1983) Estimation of fixation indexes and gene diversities. Annals of Human Genetics, 47, 253-259. +#' Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press, pp. 164-165. +#' Jost L (2008) GST and its relatives do not measure differentiation. Molecular Ecology, 17, 4015-4026. +#' +#' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()]; use on +#' a single population will return a number of quantities as NA/NaN) +#' @param by_locus boolean, determining whether the statistics should be returned by locus(TRUE), +#' or as a single genome wide value (FALSE, the default). +#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()] +#' @returns a tibble of population statistics, with populations as rows and statistics as columns +#' @export + + +# this code is adapted from hierfstat::basic.stats by Jerome Goudet +pop_global_stats <- function(.x, by_locus = FALSE, n_cores = bigstatsr::nb_cores()){ + stopifnot_diploid(.x) + # get the populations if it is a grouped gen_tibble + if (inherits(.x,"grouped_df")){ + .group_levels <- .x %>% group_keys() + .group_ids <- dplyr::group_indices(.x)-1 + } else { # create a dummy pop + .group_levels = tibble(population="pop") + .group_ids <- rep(0,nrow(.x)) + } + + # summarise population frequencies + pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, + rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = .group_ids, + ngroups = nrow(.group_levels), + ncores = n_cores) + + # get the number of individuals + n <-pop_freqs_df$n/2 + # is this correct? + sHo <-pop_freqs_df$het_obs + mHo <- apply(sHo, 1, mean, na.rm = TRUE) + + # sum of squared frequencies + sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2 +# Hs <- (1 - sp2 - sHo/2/n) +# Hs <- n/(n - 1) * Hs +# Fis = 1 - sHo/Hs + + np <- apply(n, 1, fun <- function(x) sum(!is.na(x))) + # mean sample size over the populations + mn <- apply(n, 1, fun <- function(x) { + sum(!is.na(x))/sum(1/x[!is.na(x)]) + }) + # mean sum of square frequencies + msp2 <- apply(sp2, 1, mean, na.rm = TRUE) + mp2 <- rowMeans(pop_freqs_df$freq_alt)^2+rowMeans(pop_freqs_df$freq_ref)^2 + mHs <- mn/(mn - 1) * (1 - msp2 - mHo/2/mn) + Ht <- 1 - mp2 + mHs/mn/np - mHo/2/mn/np + mFis = 1 - mHo/mHs + + Dst <- Ht - mHs + Dstp <- np/(np - 1) * Dst + Htp = mHs + Dstp + Fst = Dst/Ht + Fstp = Dstp/Htp + Dest <- Dstp/(1 - mHs) + res <- data.frame(cbind(mHo, mHs, Ht, Dst, Htp, Dstp, Fst, + Fstp, mFis, Dest)) + names(res) <- c("Ho", "Hs", "Ht", "Dst", "Htp", "Dstp", + "Fst", "Fstp", "Fis", "Dest") + + if (by_locus){ + return(res) + } else { + # overall summaries + is.na(res) <- do.call(cbind, lapply(res, is.infinite)) + overall <- apply(res, 2, mean, na.rm = TRUE) + overall[7] <- overall[4]/overall[3] + overall[8] <- overall[6]/overall[5] + overall[9] <- 1 - overall[1]/overall[2] + overall[10] <- overall[6]/(1 - overall[2]) + names(overall) <- names(res) + return(overall) + } +} + + + diff --git a/R/pop_het_exp.R b/R/pop_het_exp.R new file mode 100644 index 00000000..83aa5a7b --- /dev/null +++ b/R/pop_het_exp.R @@ -0,0 +1,78 @@ +#' Compute the population expected heterozygosity +#' +#' This function computes expected population heterozygosity (also +#' referred to as gene diversity, to avoid the potentially misleading use of the term "expected" in this context), using the formula of Nei (1987). +#' +#' @details Within population expected heterozygosity (gene diversity) \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr +#' \eqn{\hat{h}_s=\tilde{n}/(\tilde{n}-1)[1-\sum_{i}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr +#' +#' where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr +#' \eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr +#' following equation 7.39 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_s} at +#' the genome level for each population is simply the mean of the locus estimates for each population. +#' +#' @references Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press +#' +#' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()], otherwise the full tibble +#' will be considered as belonging to a single population). +#' @param by_locus boolean, determining whether Hs should be returned by locus(TRUE), +#' or as a single genome wide value (FALSE, the default). +#' @param include_global boolean determining whether, besides the population specific estiamtes, a global +#' estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value), +#' or a matrix with n+1 columns if `by_locus=TRUE`. +#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()] +#' @returns a vector of mean population observed heterozygosities (if `by_locus=FALSE`), or a matrix of +#' estimates by locus (rows are loci, columns are populations, `by_locus=TRUE`) +#' @export + + +# adapted from hierfstat +pop_het_exp <- function(.x, by_locus = FALSE, include_global = FALSE, n_cores = bigstatsr::nb_cores()){ + stopifnot_diploid(.x) + # get the populations if it is a grouped gen_tibble + if (inherits(.x,"grouped_df")){ + .group_levels <- .x %>% group_keys() + .group_ids <- dplyr::group_indices(.x)-1 + } else { # create a dummy pop + .group_levels = tibble(population="pop") + .group_ids <- rep(0,nrow(.x)) + } + + # summarise population frequencies + pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, + rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = .group_ids, + ngroups = nrow(.group_levels), + ncores = n_cores) + sHo <-pop_freqs_df$het_obs + mHo <- apply(sHo, 1, mean, na.rm = TRUE) + n <-pop_freqs_df$n/2 + # sum of squared frequencies + sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2 + Hs <- (1 - sp2 - sHo/2/n) + Hs <- n/(n - 1) * Hs + + colnames(Hs)<- .group_levels %>% dplyr::pull(1) + if (include_global){ + global <- (.x %>% pop_global_stats(by_locus = TRUE, n_cores = n_cores))$Hs + Hs <- cbind(Hs, global) + } + + if (by_locus){ + return(Hs) + } else { + return(colMeans(Hs, na.rm = TRUE)) + } +} + + +# alias for gene diversity +#' @export +#' @rdname pop_het_exp +pop_gene_div <- function (.x, by_locus = FALSE, include_global = FALSE, n_cores = bigstatsr::nb_cores()){ + pop_het_exp(.x=.x, + by_locus = by_locus, + include_global = include_global, + n_cores = n_cores) +} diff --git a/R/pop_het_obs.R b/R/pop_het_obs.R new file mode 100644 index 00000000..08b64408 --- /dev/null +++ b/R/pop_het_obs.R @@ -0,0 +1,61 @@ +#' Compute the population observed heterozygosity +#' +#' This function computes population heterozygosity, using the formula of Nei (1987). +#' +#' @details Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr +#' \eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m} \hat{X}_{kii}/s}\cr +#' where\cr +#' \eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr +#' \eqn{s} the number of populations,\cr +#' following equation 7.38 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. For +#' population specific estimates, the sum is done over a single value of \eqn{k}. \eqn{\hat{h}_o} at +#' the genome level is simply the mean of the locus estimates. +#' +#' @references Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press +#' +#' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()], otherwise the full tibble +#' will be considered as belonging to a single population). +#' @param by_locus boolean, determining whether Ho should be returned by locus(TRUE), +#' or as a single genome wide value (FALSE, the default). +#' @param include_global boolean determining whether, besides the population specific estiamtes, a global +#' estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value), +#' or a matrix with n+1 columns if `by_locus=TRUE`. +#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()] +#' @returns a vector of mean population observed heterozygosities (if `by_locus=FALSE`), or a matrix of +#' estimates by locus (rows are loci, columns are populations, `by_locus=TRUE`) +#' @export + + +# adapted from hierfstat +pop_het_obs <- function(.x, by_locus = FALSE, include_global = FALSE, n_cores = bigstatsr::nb_cores()){ + stopifnot_diploid(.x) + # get the populations if it is a grouped gen_tibble + if (inherits(.x,"grouped_df")){ + .group_levels <- .x %>% group_keys() + .group_ids <- dplyr::group_indices(.x)-1 + } else { # create a dummy pop + .group_levels = tibble(population="pop") + .group_ids <- rep(0,nrow(.x)) + } + + # summarise population frequencies + pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, + rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = .group_ids, + ngroups = nrow(.group_levels), + ncores = n_cores) + + Ho <- pop_freqs_df$het_obs + colnames(Ho)<- .group_levels %>% dplyr::pull(1) + if (include_global){ + global <- (.x %>% pop_global_stats(by_locus = TRUE, n_cores = n_cores))$Ho + Ho <- cbind(Ho, global) + } + + if (by_locus){ + return(Ho) + } else { + return(colMeans(Ho, na.rm = TRUE)) + } +} diff --git a/R/utils.R b/R/utils.R index ed5e5f70..20a6b2da 100644 --- a/R/utils.R +++ b/R/utils.R @@ -24,6 +24,9 @@ tidy_dist_matrix<- function(mat){ # stop if not diploid stopifnot_diploid <- function(x){ + if (inherits(x, "gen_tbl")){ + x <- x$genotypes + } if (attr(x,"ploidy")!=2){ stop("this function only works on diploid data") } diff --git a/R/vcf_to_fbm_cpp.R b/R/vcf_to_fbm_cpp.R index cdf1f60f..c1ee1167 100644 --- a/R/vcf_to_fbm_cpp.R +++ b/R/vcf_to_fbm_cpp.R @@ -4,8 +4,9 @@ #' This should work even for large vcf files that would not fit in memory. #' #' @param vcf_path the path to the vcf -#' @param chunk_size the chunk size to use on the vcf when loading the file -#' @param backingfile the name of the file to use as the backing file +#' @param chunk_size the chunk size to use on the vcf when loading the file. If NULL, a best guess will be made. +#' @param backingfile the name of the file to use as the backing file for the FBM. If NULL, the vcf path will be used. +#' @param n_cores the number of cores to use when reading the vcf file. Default is 1. #' @return path to the resulting rds file as class bigSNP. #' @keywords internal @@ -13,10 +14,9 @@ vcf_to_fbm_cpp <- function( vcf_path, chunk_size = NULL, backingfile = NULL, + n_cores = 1, quiet=FALSE) { - n_cores <- 3 - if (is.null(backingfile)){ backingfile <- vcf_path backingfile <- sub("\\.vcf.gz$", "", backingfile) diff --git a/README.md b/README.md index bb11a444..bf8e1727 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![R-CMD-check](https://github.com/EvolEcolGroup/tidypopgen/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/EvolEcolGroup/tidypopgen/actions/workflows/R-CMD-check.yaml) +[![codecov](https://codecov.io/gh/EvolEcolGroup/tidypopgen/branch/main/graph/badge.svg?token=KLOzxJoLBO)](https://app.codecov.io/gh/EvolEcolGroup/tidypopgen) The goal of `tidypopgen` is to provide a tidy grammar of population genetics, facilitating diff --git a/man/gen_tibble.Rd b/man/gen_tibble.Rd index b86be381..b1dd9fd5 100644 --- a/man/gen_tibble.Rd +++ b/man/gen_tibble.Rd @@ -19,6 +19,7 @@ gen_tibble( x, ..., parser = c("vcfR", "cpp"), + n_cores = 1, chunk_size = NULL, valid_alleles = c("A", "T", "C", "G"), missing_alleles = c("0", "."), @@ -83,6 +84,8 @@ a fast C++ parser, or "vcfR" to use the R package \code{vcfR}. The latter is slo but more robust; if "cpp" gives error, try using "vcfR" in case your VCF has an unusual structure.} +\item{n_cores}{the number of cores to use for parallel processing} + \item{chunk_size}{the number of loci or individuals (depending on the format) processed at a time (currently used if \code{x} is a vcf or packedancestry file)} diff --git a/man/gt_roh_window.Rd b/man/gt_roh_window.Rd index a569ba0b..441d8801 100644 --- a/man/gt_roh_window.Rd +++ b/man/gt_roh_window.Rd @@ -74,8 +74,8 @@ group table. Otherwise, the group 'column' is filled with the same values as the column } \examples{ -# don't run the example -if (FALSE) { +# run the example only if we have the package installed +if (requireNamespace("detectRUNS", quietly = TRUE)) { sheep_ped <- system.file("extdata", "Kijas2016_Sheep_subset.ped", package="detectRUNS") sheep_gt <- tidypopgen::gen_tibble(sheep_ped, backingfile = tempfile(), diff --git a/man/indiv_het_obs.Rd b/man/indiv_het_obs.Rd index ff5da3dd..4c6c6cfb 100644 --- a/man/indiv_het_obs.Rd +++ b/man/indiv_het_obs.Rd @@ -4,7 +4,6 @@ \alias{indiv_het_obs} \alias{indiv_het_obs.tbl_df} \alias{indiv_het_obs.vctrs_bigSNP} -\alias{indiv_het_obs.grouped_df} \title{Estimate individual observed heterozygosity} \usage{ indiv_het_obs(.x, ...) @@ -12,8 +11,6 @@ indiv_het_obs(.x, ...) \method{indiv_het_obs}{tbl_df}(.x, ...) \method{indiv_het_obs}{vctrs_bigSNP}(.x, ...) - -\method{indiv_het_obs}{grouped_df}(.x, ...) } \arguments{ \item{.x}{a vector of class \code{vctrs_bigSNP} (usually the \code{genotype} column of diff --git a/man/indiv_missingness.Rd b/man/indiv_missingness.Rd index 140f610f..ebec8198 100644 --- a/man/indiv_missingness.Rd +++ b/man/indiv_missingness.Rd @@ -4,7 +4,6 @@ \alias{indiv_missingness} \alias{indiv_missingness.tbl_df} \alias{indiv_missingness.vctrs_bigSNP} -\alias{indiv_missingness.grouped_df} \title{Estimate individual missingness} \usage{ indiv_missingness(.x, as_counts = FALSE, ...) @@ -12,8 +11,6 @@ indiv_missingness(.x, as_counts = FALSE, ...) \method{indiv_missingness}{tbl_df}(.x, as_counts = FALSE, ...) \method{indiv_missingness}{vctrs_bigSNP}(.x, as_counts = FALSE, ...) - -\method{indiv_missingness}{grouped_df}(.x, as_counts = FALSE, ...) } \arguments{ \item{.x}{a vector of class \code{vctrs_bigSNP} (usually the \code{genotype} column of diff --git a/man/indiv_ploidy.Rd b/man/indiv_ploidy.Rd index f67ba4e3..7210b697 100644 --- a/man/indiv_ploidy.Rd +++ b/man/indiv_ploidy.Rd @@ -4,7 +4,6 @@ \alias{indiv_ploidy} \alias{indiv_ploidy.tbl_df} \alias{indiv_ploidy.vctrs_bigSNP} -\alias{indiv_ploidy.grouped_df} \title{Return individual ploidy} \usage{ indiv_ploidy(.x, ...) @@ -12,8 +11,6 @@ indiv_ploidy(.x, ...) \method{indiv_ploidy}{tbl_df}(.x, ...) \method{indiv_ploidy}{vctrs_bigSNP}(.x, ...) - -\method{indiv_ploidy}{grouped_df}(.x, ...) } \arguments{ \item{.x}{a \code{\link{gen_tibble}}, or a vector of class \code{vctrs_bigSNP} (usually the \code{genotype} column of diff --git a/man/pop_fis.Rd b/man/pop_fis.Rd index 0c41ba4c..5a162189 100644 --- a/man/pop_fis.Rd +++ b/man/pop_fis.Rd @@ -4,15 +4,28 @@ \alias{pop_fis} \title{Compute population specific FIS} \usage{ -pop_fis(.x, include_global = FALSE, allele_sharing_mat = NULL) +pop_fis( + .x, + method = c("Nei87", "WG17"), + by_locus = FALSE, + include_global = FALSE, + allele_sharing_mat = NULL +) } \arguments{ \item{.x}{a grouped \code{\link{gen_tibble}} (as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}})} -\item{include_global}{boolean determining whether, besides the population specific fis, a global -fis should be appended. Note that this will return a vector of n populations plus 1 (the global value)} +\item{method}{one of "Nei87" (based on Nei 1987, eqn 7.41) or "WG17" (for Weir and Goudet 2017) to compute FIS} -\item{allele_sharing_mat}{optional, the matrix of Allele Sharing returned by +\item{by_locus}{boolean, determining whether FIS should be returned by locus(TRUE), +or as a single genome wide value (FALSE, the default). Note that this is only relevant for "Nei87", +as "WG17" always returns a single value.} + +\item{include_global}{boolean determining whether, besides the population specific estiamtes, a global +estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value), +or a matrix with n+1 columns if \code{by_locus=TRUE}.} + +\item{allele_sharing_mat}{optional and only relevant for "WG17", the matrix of Allele Sharing returned by \code{\link[=pairwise_allele_sharing]{pairwise_allele_sharing()}} with \code{as_matrix=TRUE}. As a number of statistics can be derived from the Allele Sharing matrix, it it sometimes more efficient to pre-compute this matrix.} @@ -21,5 +34,10 @@ it it sometimes more efficient to pre-compute this matrix.} a vector of population specific fis (plus the global value if \code{include_global=TRUE}) } \description{ -This function computes population specific FIS (as computed by \code{\link[hierfstat:fs.dosage]{hierfstat::fis.dosage()}}). +This function computes population specific FIS, using either the approach of Nei 1987 (as computed by \code{\link[hierfstat:basic.stats]{hierfstat::basic.stats()}}) or of Weir and Goudet 2017 +(as computed by \code{\link[hierfstat:fs.dosage]{hierfstat::fis.dosage()}}). +} +\references{ +Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press +Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085 } diff --git a/man/pop_fst.Rd b/man/pop_fst.Rd index f606c26c..a432aff1 100644 --- a/man/pop_fst.Rd +++ b/man/pop_fst.Rd @@ -20,5 +20,9 @@ derived from the Allele Sharing matrix,} a vector of population specific Fst (plus the global value if \code{include_global=TRUE}) } \description{ -This function computes population specific Fst (as computed by \code{\link[hierfstat:fs.dosage]{hierfstat::fst.dosage()}}). +This function computes population specific Fst, using the approach in Weir and Goudet 2017 +(as computed by \code{\link[hierfstat:fs.dosage]{hierfstat::fst.dosage()}}). +} +\references{ +Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085 } diff --git a/man/pop_global_stats.Rd b/man/pop_global_stats.Rd new file mode 100644 index 00000000..5449f96a --- /dev/null +++ b/man/pop_global_stats.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pop_global_stats.R +\name{pop_global_stats} +\alias{pop_global_stats} +\title{Compute basic population global statistics} +\usage{ +pop_global_stats(.x, by_locus = FALSE, n_cores = bigstatsr::nb_cores()) +} +\arguments{ +\item{.x}{a \code{\link{gen_tibble}} (usually grouped, as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}}; use on +a single population will return a number of quantities as NA/NaN)} + +\item{by_locus}{boolean, determining whether the statistics should be returned by locus(TRUE), +or as a single genome wide value (FALSE, the default).} + +\item{n_cores}{number of cores to be used, it defaults to \code{\link[bigstatsr:reexports]{bigstatsr::nb_cores()}}} +} +\value{ +a tibble of population statistics, with populations as rows and statistics as columns +} +\description{ +This function computes basic population global statistics, following the notation in Nei 1987 (which in turn is based on Nei and Chesser 1983): +\itemize{ +\item observed heterozygosity ( \eqn{\hat{h}_o}, column header \code{Ho}) +\item expected heterozygosity, also known as gene diversity ( \eqn{\hat{h}_s}, \code{Hs}) +\item total heterozygosity ( \eqn{\hat{h}_t}, \code{Ht}) +\item genetic differentiation between subpopulations (\eqn{D_{st}}, \code{Dst}) +\item corrected total population diversity (\eqn{h'_t}, \code{Htp}) +\item corrected genetic differentiation between subpopulations (\eqn{D'_{st}}, \code{Dstp}) +\item \eqn{\hat{F}_{ST}} (column header, \code{Fst}) +\item corrected \eqn{\hat{F'}_{ST}} (column header \code{Fstp}) +\item \eqn{\hat{F}_{IS}} (column header, \code{Fis}) +\item Jost's \eqn{\hat{D}} (column header, \code{Dest}) +} +} +\details{ +We use the notation of Nei 1987. That notation was for loci with \eqn{m} alleles, but in our case we only have two alleles, so \code{m=2}. +\itemize{ +\item Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr +\eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m} \hat{X}_{kii}/s}\cr +where\cr +\eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr +\eqn{s} the number of populations,\cr +following equation 7.38 in Nei(1987) on pp.164.\cr +\item Within population expected heterozygosity (gene diversity) \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr +\eqn{\hat{h}_s=(\tilde{n}/(\tilde{n}-1))[1-\sum_{i=1}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr +where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr +\eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr +following equation 7.39 in Nei(1987) on pp.164. +\item Total heterozygosity (total gene diversity) \eqn{\hat{h}_t} for a locus with \eqn{m} alleles is defined as:\cr +\eqn{\hat{h}_t = 1-\sum_{i=1}^{m} \bar{\hat{x}_i^2} + \hat{h}_s/(\tilde{n}s) - \hat{h}_o/(2\tilde{n}s)}\cr +where \cr +\eqn{\hat{x}_i=\sum_k \hat{x}_{ki}/s}\cr +following equation 7.40 in Nei(1987) on pp.164.\cr +\item The amount of gene diversity among samples \eqn{D_{ST}} is defined as:\cr +\eqn{D_{ST} = \hat{h}_t - \hat{h}_s}\cr +following the equation provided in the text at the top of page 165 in Nei(1987). +\item The corrected amount of gene diversity among samples \eqn{D'_{ST}} is defined as:\cr +\eqn{D'_{ST} = (s/(s-1))D'_{ST}}\cr +following the equation provided in the text at the top of page 165 in Nei(1987). +\item Total corrected heterozygosity (total gene diversity) \eqn{\hat{h}_t} is defined as:\cr +\eqn{\hat{h'}_t = \hat{h}_s + D'_{ST}}\cr +following the equation provided in the text at the top of page 165 in Nei(1987). +\item \eqn{\hat{F}_{IS}} is defined as:\cr +\eqn{\hat{F}_{IS} = 1 - \hat{h}_o/\hat{h}_s}\cr +following equation 7.41 in Nei(1987) on pp.164.\cr +\item \eqn{\hat{F}_{ST}} is defined as:\cr +\eqn{\hat{F}_{ST} = 1 - \hat{h}_s/\hat{h}_t = D_{ST}/\hat{h}_t}\cr +following equation 7.43 in Nei(1987) on pp.165.\cr +\item \eqn{\hat{F'}_{ST}} is defined as:\cr +\eqn{\hat{F'}_{ST} = D'_{ST}/\hat{h'}_t}\cr +following the explanation provided in the text at the top of page 165 in Nei(1987). +\item Jost's \eqn{\hat{D}} is defined as:\cr +\eqn{\hat{D} = (s/(s-1))((\hat{h'}_t-\hat{h}_s)/(1-\hat{h}_s))}\cr +as defined by Jost(2008) +} + +All these statistics are first computed by locus, and then averaged across loci (including any +monorphic locus) to obtain genome-wide values. The function uses the same algorithm as +\code{\link[hierfstat:basic.stats]{hierfstat::basic.stats()}} but is optimized for speed and memory usage. +} +\references{ +Nei M, Chesser R (1983) Estimation of fixation indexes and gene diversities. Annals of Human Genetics, 47, 253-259. +Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press, pp. 164-165. +Jost L (2008) GST and its relatives do not measure differentiation. Molecular Ecology, 17, 4015-4026. +} diff --git a/man/pop_het_exp.Rd b/man/pop_het_exp.Rd new file mode 100644 index 00000000..3d674f53 --- /dev/null +++ b/man/pop_het_exp.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pop_het_exp.R +\name{pop_het_exp} +\alias{pop_het_exp} +\alias{pop_gene_div} +\title{Compute the population expected heterozygosity} +\usage{ +pop_het_exp( + .x, + by_locus = FALSE, + include_global = FALSE, + n_cores = bigstatsr::nb_cores() +) + +pop_gene_div( + .x, + by_locus = FALSE, + include_global = FALSE, + n_cores = bigstatsr::nb_cores() +) +} +\arguments{ +\item{.x}{a \code{\link{gen_tibble}} (usually grouped, as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}}, otherwise the full tibble +will be considered as belonging to a single population).} + +\item{by_locus}{boolean, determining whether Hs should be returned by locus(TRUE), +or as a single genome wide value (FALSE, the default).} + +\item{include_global}{boolean determining whether, besides the population specific estiamtes, a global +estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value), +or a matrix with n+1 columns if \code{by_locus=TRUE}.} + +\item{n_cores}{number of cores to be used, it defaults to \code{\link[bigstatsr:reexports]{bigstatsr::nb_cores()}}} +} +\value{ +a vector of mean population observed heterozygosities (if \code{by_locus=FALSE}), or a matrix of +estimates by locus (rows are loci, columns are populations, \code{by_locus=TRUE}) +} +\description{ +This function computes expected population heterozygosity (also +referred to as gene diversity, to avoid the potentially misleading use of the term "expected" in this context), using the formula of Nei (1987). +} +\details{ +Within population expected heterozygosity (gene diversity) \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr +\eqn{\hat{h}_s=\tilde{n}/(\tilde{n}-1)[1-\sum_{i}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr + +where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr +\eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr +following equation 7.39 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_s} at +the genome level for each population is simply the mean of the locus estimates for each population. +} +\references{ +Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press +} diff --git a/man/pop_het_obs.Rd b/man/pop_het_obs.Rd new file mode 100644 index 00000000..c5848578 --- /dev/null +++ b/man/pop_het_obs.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pop_het_obs.R +\name{pop_het_obs} +\alias{pop_het_obs} +\title{Compute the population observed heterozygosity} +\usage{ +pop_het_obs( + .x, + by_locus = FALSE, + include_global = FALSE, + n_cores = bigstatsr::nb_cores() +) +} +\arguments{ +\item{.x}{a \code{\link{gen_tibble}} (usually grouped, as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}}, otherwise the full tibble +will be considered as belonging to a single population).} + +\item{by_locus}{boolean, determining whether Ho should be returned by locus(TRUE), +or as a single genome wide value (FALSE, the default).} + +\item{include_global}{boolean determining whether, besides the population specific estiamtes, a global +estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value), +or a matrix with n+1 columns if \code{by_locus=TRUE}.} + +\item{n_cores}{number of cores to be used, it defaults to \code{\link[bigstatsr:reexports]{bigstatsr::nb_cores()}}} +} +\value{ +a vector of mean population observed heterozygosities (if \code{by_locus=FALSE}), or a matrix of +estimates by locus (rows are loci, columns are populations, \code{by_locus=TRUE}) +} +\description{ +This function computes population heterozygosity, using the formula of Nei (1987). +} +\details{ +Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr +\eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m} \hat{X}_{kii}/s}\cr +where\cr +\eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr +\eqn{s} the number of populations,\cr +following equation 7.38 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. For +population specific estimates, the sum is done over a single value of \eqn{k}. \eqn{\hat{h}_o} at +the genome level is simply the mean of the locus estimates. +} +\references{ +Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press +} diff --git a/man/vcf_to_fbm_cpp.Rd b/man/vcf_to_fbm_cpp.Rd index 3e5878a4..ee02aeb9 100644 --- a/man/vcf_to_fbm_cpp.Rd +++ b/man/vcf_to_fbm_cpp.Rd @@ -4,14 +4,22 @@ \alias{vcf_to_fbm_cpp} \title{Convert vcf to FBM.} \usage{ -vcf_to_fbm_cpp(vcf_path, chunk_size = NULL, backingfile = NULL, quiet = FALSE) +vcf_to_fbm_cpp( + vcf_path, + chunk_size = NULL, + backingfile = NULL, + n_cores = 1, + quiet = FALSE +) } \arguments{ \item{vcf_path}{the path to the vcf} -\item{chunk_size}{the chunk size to use on the vcf when loading the file} +\item{chunk_size}{the chunk size to use on the vcf when loading the file. If NULL, a best guess will be made.} -\item{backingfile}{the name of the file to use as the backing file} +\item{backingfile}{the name of the file to use as the backing file for the FBM. If NULL, the vcf path will be used.} + +\item{n_cores}{the number of cores to use when reading the vcf file. Default is 1.} } \value{ path to the resulting rds file as class bigSNP. diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R index a76451bb..51c55148 100644 --- a/tests/testthat/test_gen_tibble.R +++ b/tests/testthat/test_gen_tibble.R @@ -190,7 +190,7 @@ test_that("gen_tibble from files",{ expect_true(all.equal(show_genotypes(pop_a_gt),show_genotypes(pop_a_vcf_gt))) # reload it in chunks pop_a_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), - chunk_size = 2) + chunk_size = 2, parser="vcfR") expect_true(all.equal(show_genotypes(pop_a_vcf_gt2),show_genotypes(pop_a_vcf_gt))) expect_true(all.equal(show_loci(pop_a_vcf_gt2),show_loci(pop_a_vcf_gt))) expect_true(is.integer(show_loci(pop_a_vcf_gt)$chr_int)) @@ -235,11 +235,12 @@ test_that("gen_tibble from files with missingness",{ # PLINK VCF files ######################## vcf_path <- system.file("extdata/pop_b.vcf", package = "tidypopgen") - pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile()) + pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), + parser="vcfR") expect_true(all.equal(show_genotypes(pop_b_gt),show_genotypes(pop_b_vcf_gt))) # reload it in chunks pop_b_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), - chunk_size = 2) + chunk_size = 2, parser = "vcfR") expect_true(all.equal(show_genotypes(pop_b_vcf_gt2),show_genotypes(pop_b_vcf_gt))) expect_true(all.equal(show_loci(pop_b_vcf_gt2),show_loci(pop_b_vcf_gt))) expect_true(is.integer(show_loci(pop_b_vcf_gt2)$chr_int)) @@ -549,11 +550,11 @@ test_that("gt without population is valid",{ gt_load(file_names[1]) # will fail in functions that require grouping - expect_error(pop_fis(test_gt), ".x should be a grouped df") + expect_error(pop_fst(test_gt), ".x should be a grouped gen_tibble") # functions that require grouping work after adding groups test_gt$groups <- c("A","A","B") test_gt <- test_gt %>% group_by(groups) - expect_equal(unname(pop_fis(test_gt)), c(-0.25, NaN)) + expect_equal(unname(pop_fst(test_gt)), c(-0, NaN)) # gen_tbl from vcf does not have population automatically vcf_path <- system.file("extdata/pop_b.vcf", package = "tidypopgen") @@ -562,6 +563,21 @@ test_that("gt without population is valid",{ }) +test_that("additional vcf tests with larger file",{ + vcf_path <- system.file("/extdata/anolis/punctatus_t70_s10_n46_filtered.recode.vcf.gz", + package = "tidypopgen") + anole_gt <- gen_tibble(vcf_path, quiet = TRUE, parser = "cpp", backingfile = tempfile("anolis_")) + anole_gt_vcfr <- gen_tibble(vcf_path, quiet = TRUE, parser = "vcfR", + backingfile = tempfile("anolis_")) + expect_true(all.equal(show_genotypes(anole_gt),show_genotypes(anole_gt_vcfr))) + anole_gt2 <- gen_tibble(vcf_path, quiet = TRUE, parser = "cpp", backingfile = tempfile("anolis_"), + chunk_size = 1000, n_cores = 2) + expect_true(all.equal(show_genotypes(anole_gt2),show_genotypes(anole_gt_vcfr))) +} +) + + + # Windows prevents the deletion of the backing file. It's something to do with the memory mapping # library used by bigsnpr # test_that("on error, we remove the old files",{ diff --git a/tests/testthat/test_pop_basic_stats.R b/tests/testthat/test_pop_basic_stats.R new file mode 100644 index 00000000..78af26a1 --- /dev/null +++ b/tests/testthat/test_pop_basic_stats.R @@ -0,0 +1,88 @@ +test_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,NA,0,0), + c(2,NA,0,0,1,1), + c(1,0,0,1,0,0), + c(1,2,0,1,2,1), + c(0,0,0,0,NA,1), + c(0,1,1,0,1,NA)) +test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"), + population = c("pop1","pop1","pop2","pop2","pop1","pop3","pop3")) +test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=paste0("chr",c(1,1,1,1,2,2)), + position=as.integer(c(3,5,65,343,23,456)), + genetic_dist = as.integer(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) + +test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) + +test_that("pop_* basic stats functions work correctly",{ + test_gt <- test_gt %>% dplyr::group_by(population) + test_hier <- gt_as_hierfstat(test_gt) + basic_hier <- hierfstat::basic.stats(test_hier) + + # observed heterozygosity by locus + test_het_obs <- test_gt %>% pop_het_obs(by_locus = TRUE) + expect_true(all.equal(basic_hier$Ho, round(test_het_obs, 4), check.attributes=FALSE)) + # overall (mean of by locus values) + test_het_obs <- test_gt %>% pop_het_obs(by_locus = FALSE) + expect_true(all.equal(colMeans(basic_hier$Ho, na.rm=TRUE), test_het_obs, check.attributes=FALSE)) + # check the overall value + test_het_obs <- test_gt %>% pop_het_obs(by_locus = TRUE, include_global = TRUE) + expect_true(all.equal(round(test_het_obs[,4], 4), basic_hier$perloc$Ho, check.attributes=FALSE)) + test_het_obs <- test_gt %>% pop_het_obs(by_locus = FALSE, include_global = TRUE) + expect_true(all.equal(round(test_het_obs[4], 4), basic_hier$overall["Ho"], check.attributes=FALSE)) + + # test expected heterozygosity by locus + test_het_exp <- test_gt %>% pop_het_exp(by_locus = TRUE) + expect_true(all.equal(basic_hier$Hs, round(test_het_exp, 4), check.attributes=FALSE)) + # overall (mean of by locus values) + test_het_exp <- test_gt %>% pop_het_exp(by_locus = FALSE) + expect_true(all.equal(colMeans(basic_hier$Hs, na.rm=TRUE), test_het_exp, check.attributes=FALSE)) + # check the overall value + test_het_exp <- test_gt %>% pop_het_exp(by_locus = TRUE, include_global = TRUE) + expect_true(all.equal(round(test_het_exp[,4], 4), basic_hier$perloc$Hs, check.attributes=FALSE)) + test_het_exp <- test_gt %>% pop_het_exp(by_locus = FALSE, include_global = TRUE) + expect_true(all.equal(round(test_het_exp[4], 4), basic_hier$overall["Hs"], check.attributes=FALSE)) + + # test Fis by locus + test_fis <- test_gt %>% pop_fis(by_locus = TRUE, method= "Nei87") + expect_true(all.equal(basic_hier$Fis, round(test_fis, 4), check.attributes=FALSE)) + # overall (mean of by locus values) + test_fis <- test_gt %>% pop_fis(by_locus = FALSE) + expect_true(all.equal(round(colMeans(basic_hier$Fis, na.rm=TRUE),4), round(test_fis,4), check.attributes=FALSE)) + # check the overall value + test_fis <- test_gt %>% pop_fis(by_locus = TRUE, include_global = TRUE) + expect_true(all.equal(round(test_fis[,4], 4), basic_hier$perloc$Fis, check.attributes=FALSE)) + test_fis <- test_gt %>% pop_fis(by_locus = FALSE, include_global = TRUE) + expect_true(all.equal(round(test_fis[4], 4), basic_hier$overall["Fis"], check.attributes=FALSE)) + + # test global stats by locus + test_global_stats <- test_gt %>% pop_global_stats(by_locus = TRUE) + expect_true(all.equal(basic_hier$perloc, round(test_global_stats, 4), check.attributes=FALSE)) + # test global stats overall + test_global_stats <- test_gt %>% pop_global_stats(by_locus = FALSE) + expect_true(all.equal(basic_hier$overall, round(test_global_stats, 4), check.attributes=FALSE)) + + # test Fis errors for incorrect parameters for Nei87 + expect_error(test_gt %>% pop_fis(by_locus = TRUE, method= "Nei87", allele_sharing_mat = matrix(1, nrow=7, ncol=7)), + "allele_sharing_mat not relevant for Nei87") + # test Fis errors for incorrect parameters for WG17 + expect_error(test_gt %>% pop_fis(by_locus = TRUE, method= "WG17", allele_sharing_mat = matrix(1, nrow=7, ncol=7)), + "by_locus not implemented for WG17") + # test alias for pop_het_exp + expect_identical(test_het_exp, test_gt %>% pop_gene_div(by_locus = FALSE, include_global = TRUE)) + +}) + +test_that("pop_* basic stats work on a single population",{ + test_het_obs <- test_gt %>% pop_het_obs(by_locus = TRUE) + expect_true(ncol(test_het_obs) == 1) # just one column, as we only have one population + test_fis <- test_gt %>% pop_fis(by_locus = TRUE) + expect_true(ncol(test_fis) == 1) # just one column, as we only have one population + test_het_exp <- test_gt %>% pop_het_exp(by_locus = TRUE) + expect_true(ncol(test_het_exp) == 1) # just one column, as we only have one population + test_global_stats <- test_gt %>% pop_global_stats(by_locus = TRUE) + expect_true(all(is.nan(test_global_stats$Fstp))) # Fstp is not defined for a single population + +}) diff --git a/tests/testthat/test_pop_fis_fst.R b/tests/testthat/test_pop_fis_fst.R index 017dfdfd..48b910f9 100644 --- a/tests/testthat/test_pop_fis_fst.R +++ b/tests/testthat/test_pop_fis_fst.R @@ -17,19 +17,25 @@ test_loci <- data.frame(name=paste0("rs",1:6), test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) -test_that("pop_fst and pop_fist compute correctly",{ +test_that("pop_fst and pop_fist WG17 compute correctly",{ + # expect error if the tibble is not grouped + expect_error(test_gt %>% pop_fis(method="WG17"), + ".x should be a grouped gen_tibble") + + expect_error(test_gt %>% pop_fst(), + ".x should be a grouped gen_tibble") + + # group the tibble test_gt <- test_gt %>% dplyr::group_by(population) # compare results against raw hierfstat code - fis_by_pop <- test_gt %>% pop_fis(include_global=TRUE) + fis_by_pop <- test_gt %>% pop_fis(include_global=TRUE, method = "WG17") fis_by_pop_hier <- hierfstat::fis.dosage(test_genotypes,pop=test_indiv_meta$population) expect_true(all.equal(fis_by_pop, fis_by_pop_hier,check.attributes = FALSE)) # now check that we don't get the global - fis_by_pop_sub <- test_gt %>% pop_fis() + fis_by_pop_sub <- test_gt %>% pop_fis(method = "WG17") expect_true (all.equal(fis_by_pop[-length(fis_by_pop)], fis_by_pop_sub)) - - # compare results against raw hierfstat code fst_by_pop <- test_gt %>% pop_fst(include_global=TRUE) fst_by_pop_hier <- hierfstat::fst.dosage(test_genotypes,pop=test_indiv_meta$population) diff --git a/tests/testthat/test_q_matrix.R b/tests/testthat/test_q_matrix.R index d6f5c273..3f29dc56 100644 --- a/tests/testthat/test_q_matrix.R +++ b/tests/testthat/test_q_matrix.R @@ -15,9 +15,7 @@ test_that("q_matrix reads q_files",{ expect_true(inherits(anolis_q_k3_mat,"q_matrix")) #read multiple .Q - package_path <- find.package("tidypopgen") - folder <- "/extdata/anolis" - q_folder <- file.path(package_path, folder) + q_folder <- system.file("/extdata/anolis", package = "tidypopgen") anolis_q <- q_matrix(q_folder) expect_true(inherits(anolis_q,"q_matrix_list")) expect_true(inherits(anolis_q[[1]][[1]],"q_matrix"))