Skip to content

Commit

Permalink
Merge pull request #56 from EvolEcolGroup/sum_stats
Browse files Browse the repository at this point in the history
Sum stats
  • Loading branch information
dramanica authored Oct 19, 2024
2 parents f8610e8 + c71fcf3 commit 1a19203
Show file tree
Hide file tree
Showing 31 changed files with 755 additions and 88 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tidypopgen
Title: Tidy Population Genetics
Version: 0.0.0.9017
Version: 0.0.0.9018
Authors@R:
c(person("Evie", "Carter", role = c("aut")),
person("Andrea", "Manica", email = "am315@cam.ac.uk",
Expand Down
7 changes: 4 additions & 3 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,10 @@ S3method(dplyr_row_slice,grouped_gen_tbl)
S3method(gen_tibble,character)
S3method(gen_tibble,matrix)
S3method(group_by,gen_tbl)
S3method(indiv_het_obs,grouped_df)
S3method(indiv_het_obs,tbl_df)
S3method(indiv_het_obs,vctrs_bigSNP)
S3method(indiv_missingness,grouped_df)
S3method(indiv_missingness,tbl_df)
S3method(indiv_missingness,vctrs_bigSNP)
S3method(indiv_ploidy,grouped_df)
S3method(indiv_ploidy,tbl_df)
S3method(indiv_ploidy,vctrs_bigSNP)
S3method(loci_alt_freq,grouped_df)
Expand Down Expand Up @@ -124,6 +121,10 @@ export(pairwise_king)
export(pairwise_pop_fst)
export(pop_fis)
export(pop_fst)
export(pop_gene_div)
export(pop_global_stats)
export(pop_het_exp)
export(pop_het_obs)
export(q_matrix)
export(qc_report_indiv)
export(qc_report_loci)
Expand Down
3 changes: 3 additions & 0 deletions R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#' a fast C++ parser, or "vcfR" to use the R package `vcfR`. The latter is slower
#' but more robust; if "cpp" gives error, try using "vcfR" in case your VCF has
#' an unusual structure.
#' @param n_cores the number of cores to use for parallel processing
#' @param valid_alleles a vector of valid allele values; it defaults to 'A','T',
#' 'C' and 'G'.
#' @param missing_alleles a vector of values in the BIM file/loci dataframe that
Expand Down Expand Up @@ -76,6 +77,7 @@ gen_tibble.character <-
function(x,
...,
parser = c("vcfR","cpp"),
n_cores = 1,
chunk_size = NULL,
valid_alleles = c("A", "T", "C", "G"),
missing_alleles = c("0","."),
Expand Down Expand Up @@ -107,6 +109,7 @@ gen_tibble.character <-
quiet = quiet)
} else if ((tolower(file_ext(x))=="vcf") || (tolower(file_ext(x))=="gz")){
return(gen_tibble_vcf(x = x, ..., parser = parser, chunk_size = chunk_size,
n_cores = n_cores,
valid_alleles= valid_alleles,
missing_alleles= missing_alleles,
backingfile = backingfile, quiet = quiet))
Expand Down
20 changes: 0 additions & 20 deletions R/gt_helper_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,3 @@
.gt_get_bigsnp<-function(.x){
attr(.x$genotypes,"bigsnp")
}


# a developer function to create various count summaries of a population, used to
# compute more complex statistics (e.g. pairwise fst, etc.).
.gt_pop_freqs <- function(.x){
counts <- bigstatsr::big_counts( .gt_get_bigsnp(.x)$genotypes,
ind.row =.gt_bigsnp_rows(.x),
ind.col = .gt_bigsnp_cols(.x))
sums_alt <- apply(counts,2,function(x) x[2]+2*x[3])
n <- apply(counts,2,function(x) sum(x[1:3])*2)
sums_ref <- n - sums_alt
freq_alt <- sums_alt/n
freq_ref <- 1- freq_alt
het_obs <- apply(counts,2,function(x) x[2]/sum(x[1:3]))
return (list(
freq_alt = freq_alt,
freq_ref = freq_ref,
n = n,
het_obs = het_obs))
}
4 changes: 2 additions & 2 deletions R/gt_roh_window.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@
#' @export
#'
#' @examples
#' # don't run the example
#' if (FALSE) {
#' # run the example only if we have the package installed
#' if (requireNamespace("detectRUNS", quietly = TRUE)) {
#' sheep_ped <- system.file("extdata", "Kijas2016_Sheep_subset.ped",
#' package="detectRUNS")
#' sheep_gt <- tidypopgen::gen_tibble(sheep_ped, backingfile = tempfile(),
Expand Down
10 changes: 5 additions & 5 deletions R/indiv_het_obs.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ indiv_het_obs.vctrs_bigSNP <- function(.x, ...){
this_col_1_na[1,]/(ncol(X)-this_col_1_na[2,])
}

#' @export
#' @rdname indiv_het_obs
indiv_het_obs.grouped_df <- function(.x, ...){
.x %>% mutate(indiv_het_obs = indiv_het_obs(.data$genotypes)) %>% summarise(het_obs = mean(indiv_het_obs))
}
# #' @export
# #' @rdname indiv_het_obs
# indiv_het_obs.grouped_df <- function(.x, ...){
# .x %>% mutate(indiv_het_obs = indiv_het_obs(.data$genotypes)) %>% summarise(het_obs = mean(indiv_het_obs))
# }
12 changes: 6 additions & 6 deletions R/indiv_missingness.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ indiv_missingness.vctrs_bigSNP <- function(.x, as_counts = FALSE, ...){
row_na
}

#' @export
#' @rdname indiv_missingness
indiv_missingness.grouped_df <- function(.x, as_counts = FALSE, ...){
.x %>% mutate(indiv_missingness = indiv_missingness(.data$genotypes, as_counts = as_counts)) %>%
summarise(het_obs = mean(indiv_missingness))
}
#' #' @export
#' #' @rdname indiv_missingness
#' indiv_missingness.grouped_df <- function(.x, as_counts = FALSE, ...){
#' .x %>% mutate(indiv_missingness = indiv_missingness(.data$genotypes, as_counts = as_counts)) %>%
#' summarise(het_obs = mean(indiv_missingness))
#' }
12 changes: 6 additions & 6 deletions R/indiv_ploidy.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ indiv_ploidy.vctrs_bigSNP <- function(.x, ...){
}
}

#' @export
#' @rdname indiv_ploidy
indiv_ploidy.grouped_df <- function(.x, ...){
.x %>% mutate(indiv_ploidy = indiv_ploidy(.data$genotypes)) %>%
summarise(ploidy = mean(indiv_ploidy))
}
#' #' @export
#' #' @rdname indiv_ploidy
#' indiv_ploidy.grouped_df <- function(.x, ...){
#' .x %>% mutate(indiv_ploidy = indiv_ploidy(.data$genotypes)) %>%
#' summarise(ploidy = mean(indiv_ploidy))
#' }
82 changes: 76 additions & 6 deletions R/pop_fis.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,89 @@
#' Compute population specific FIS
#'
#' This function computes population specific FIS (as computed by [hierfstat::fis.dosage()]).
#' This function computes population specific FIS, using either the approach of Nei 1987 (as computed by [hierfstat::basic.stats()]) or of Weir and Goudet 2017
#' (as computed by [hierfstat::fis.dosage()]).
#' @references
#' Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press
#' Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085
#' @param .x a grouped [`gen_tibble`] (as obtained by using [dplyr::group_by()])
#' @param include_global boolean determining whether, besides the population specific fis, a global
#' fis should be appended. Note that this will return a vector of n populations plus 1 (the global value)
#' @param allele_sharing_mat optional, the matrix of Allele Sharing returned by
#' @param method one of "Nei87" (based on Nei 1987, eqn 7.41) or "WG17" (for Weir and Goudet 2017) to compute FIS
#' @param by_locus boolean, determining whether FIS should be returned by locus(TRUE),
#' or as a single genome wide value (FALSE, the default). Note that this is only relevant for "Nei87",
#' as "WG17" always returns a single value.
#' @param include_global boolean determining whether, besides the population specific estiamtes, a global
#' estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value),
#' or a matrix with n+1 columns if `by_locus=TRUE`.
#' @param allele_sharing_mat optional and only relevant for "WG17", the matrix of Allele Sharing returned by
#' [pairwise_allele_sharing()] with `as_matrix=TRUE`. As a number of statistics can be
#' derived from the Allele Sharing matrix,
#' it it sometimes more efficient to pre-compute this matrix.
#' @returns a vector of population specific fis (plus the global value if `include_global=TRUE`)
#' @export

pop_fis <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){
pop_fis <- function(.x, method = c("Nei87", "WG17"), by_locus = FALSE, include_global=FALSE, allele_sharing_mat = NULL){
method <- match.arg(method)
if (method == "Nei87"){
if (!is.null(allele_sharing_mat)){
stop("allele_sharing_mat not relevant for Nei87")
}
pop_fis_nei87(.x, by_locus = by_locus, include_global = include_global)
} else if (method == "WG17"){
if (by_locus){
stop("by_locus not implemented for WG17")
}
pop_fis_wg17(.x, include_global=include_global, allele_sharing_mat = allele_sharing_mat)
}
}

pop_fis_nei87 <- function(.x, by_locus = FALSE, include_global=include_global,
n_cores = bigstatsr::nb_cores()){
stopifnot_diploid(.x)
# get the populations if it is a grouped gen_tibble
if (inherits(.x,"grouped_df")){
.group_levels <- .x %>% group_keys()
.group_ids <- dplyr::group_indices(.x)-1
} else { # create a dummy pop
.group_levels = tibble(population="pop")
.group_ids <- rep(0,nrow(.x))
}

# summarise population frequencies
pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes,
rowInd = .gt_bigsnp_rows(.x),
colInd = .gt_bigsnp_cols(.x),
groupIds = .group_ids,
ngroups = nrow(.group_levels),
ncores = n_cores)
sHo <-pop_freqs_df$het_obs
mHo <- apply(sHo, 1, mean, na.rm = TRUE)
n <-pop_freqs_df$n/2
# sum of squared frequencies
sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2
Hs <- (1 - sp2 - sHo/2/n)
Hs <- n/(n - 1) * Hs
Fis = 1 - sHo/Hs

colnames(Fis)<- .group_levels %>% dplyr::pull(1)
if (by_locus){
if (include_global){
global <- (.x %>% pop_global_stats(by_locus = TRUE, n_cores = n_cores))$Fis
Fis <- cbind(Fis, global)
}
} else {
Fis <- colMeans(Fis, na.rm = TRUE)
if (include_global){
global <- (.x %>% pop_global_stats(by_locus = FALSE, n_cores = n_cores))["Fis"]
names(global) <- "global"
Fis <- c(Fis, global)
}
}
return(Fis)
}


pop_fis_wg17 <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){
if (!inherits(.x,"grouped_df")){
stop (".x should be a grouped df")
stop (".x should be a grouped gen_tibble")
}
if (is.null(allele_sharing_mat)){
allele_sharing_mat <- pairwise_allele_sharing(.x, as_matrix = TRUE)
Expand Down
6 changes: 4 additions & 2 deletions R/pop_fst.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#' Compute population specific Fst
#'
#' This function computes population specific Fst (as computed by [hierfstat::fst.dosage()]).
#' This function computes population specific Fst, using the approach in Weir and Goudet 2017
#' (as computed by [hierfstat::fst.dosage()]).
#' @references Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085
#' @param .x a grouped [`gen_tibble`] (as obtained by using [dplyr::group_by()])
#' @param include_global boolean determining whether, besides the population specific Fst, a global
#' Fst should be appended. Note that this will return a vector of n populations plus 1 (the global value)
Expand All @@ -12,7 +14,7 @@

pop_fst <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){
if (!inherits(.x,"grouped_df")){
stop (".x should be a grouped df")
stop (".x should be a grouped gen_tibble")
}
if (is.null(allele_sharing_mat)){
allele_sharing_mat <- pairwise_allele_sharing(.x, as_matrix = TRUE)
Expand Down
Loading

0 comments on commit 1a19203

Please sign in to comment.