Merge pull request #56 from EvolEcolGroup/sum_stats

Sum stats
EvolEcolGroup · Oct 19, 2024 · 1a19203 · 1a19203
2 parents f8610e8 + c71fcf3
commit 1a19203
Show file tree

Hide file tree

Showing 31 changed files with 755 additions and 88 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tidypopgen
 Title: Tidy Population Genetics
-Version: 0.0.0.9017
+Version: 0.0.0.9018
 Authors@R: 
     c(person("Evie", "Carter", role = c("aut")),
     person("Andrea", "Manica", email = "am315@cam.ac.uk", 

diff --git a/NAMESPACE b/NAMESPACE
@@ -24,13 +24,10 @@ S3method(dplyr_row_slice,grouped_gen_tbl)
 S3method(gen_tibble,character)
 S3method(gen_tibble,matrix)
 S3method(group_by,gen_tbl)
-S3method(indiv_het_obs,grouped_df)
 S3method(indiv_het_obs,tbl_df)
 S3method(indiv_het_obs,vctrs_bigSNP)
-S3method(indiv_missingness,grouped_df)
 S3method(indiv_missingness,tbl_df)
 S3method(indiv_missingness,vctrs_bigSNP)
-S3method(indiv_ploidy,grouped_df)
 S3method(indiv_ploidy,tbl_df)
 S3method(indiv_ploidy,vctrs_bigSNP)
 S3method(loci_alt_freq,grouped_df)
@@ -124,6 +121,10 @@ export(pairwise_king)
 export(pairwise_pop_fst)
 export(pop_fis)
 export(pop_fst)
+export(pop_gene_div)
+export(pop_global_stats)
+export(pop_het_exp)
+export(pop_het_obs)
 export(q_matrix)
 export(qc_report_indiv)
 export(qc_report_loci)

diff --git a/R/gen_tibble.R b/R/gen_tibble.R
@@ -37,6 +37,7 @@
 #' a fast C++ parser, or "vcfR" to use the R package `vcfR`. The latter is slower
 #' but more robust; if "cpp" gives error, try using "vcfR" in case your VCF has
 #' an unusual structure.
+#' @param n_cores the number of cores to use for parallel processing
 #' @param valid_alleles a vector of valid allele values; it defaults to 'A','T',
 #' 'C' and 'G'.
 #' @param missing_alleles a vector of values in the BIM file/loci dataframe that
@@ -76,6 +77,7 @@ gen_tibble.character <-
   function(x,
            ...,
            parser = c("vcfR","cpp"),
+           n_cores = 1,
            chunk_size = NULL,
            valid_alleles = c("A", "T", "C", "G"),
            missing_alleles = c("0","."),
@@ -107,6 +109,7 @@ gen_tibble.character <-
                        quiet = quiet)
   } else if ((tolower(file_ext(x))=="vcf") || (tolower(file_ext(x))=="gz")){
     return(gen_tibble_vcf(x = x, ..., parser = parser, chunk_size = chunk_size,
+                          n_cores = n_cores,
                    valid_alleles= valid_alleles,
                    missing_alleles= missing_alleles,
                    backingfile = backingfile, quiet = quiet))

diff --git a/R/gt_helper_functions.R b/R/gt_helper_functions.R
@@ -10,23 +10,3 @@
 .gt_get_bigsnp<-function(.x){
   attr(.x$genotypes,"bigsnp")
 }
-
-
-# a developer function to create various count summaries of a population, used to
-# compute more complex statistics (e.g. pairwise fst, etc.).
-.gt_pop_freqs <- function(.x){
-  counts <- bigstatsr::big_counts( .gt_get_bigsnp(.x)$genotypes,
-                                   ind.row =.gt_bigsnp_rows(.x),
-                                   ind.col = .gt_bigsnp_cols(.x))
-  sums_alt <- apply(counts,2,function(x) x[2]+2*x[3])
-  n <- apply(counts,2,function(x) sum(x[1:3])*2)
-  sums_ref <- n - sums_alt
-  freq_alt <- sums_alt/n
-  freq_ref <- 1- freq_alt
-  het_obs <- apply(counts,2,function(x) x[2]/sum(x[1:3]))
-  return (list(
-    freq_alt = freq_alt,
-    freq_ref = freq_ref,
-    n = n,
-    het_obs = het_obs))
-}
diff --git a/R/gt_roh_window.R b/R/gt_roh_window.R
@@ -42,8 +42,8 @@
 #' @export
 #'
 #' @examples
-#' # don't run the example
-#' if (FALSE) {
+#' # run the example only if we have the package installed
+#' if (requireNamespace("detectRUNS", quietly = TRUE)) {
 #' sheep_ped <- system.file("extdata", "Kijas2016_Sheep_subset.ped",
 #'     package="detectRUNS")
 #' sheep_gt <- tidypopgen::gen_tibble(sheep_ped, backingfile = tempfile(),

diff --git a/R/indiv_het_obs.R b/R/indiv_het_obs.R
@@ -48,8 +48,8 @@ indiv_het_obs.vctrs_bigSNP <- function(.x, ...){
   this_col_1_na[1,]/(ncol(X)-this_col_1_na[2,])
 }
 
-#' @export
-#' @rdname indiv_het_obs
-indiv_het_obs.grouped_df <- function(.x, ...){
-  .x %>% mutate(indiv_het_obs = indiv_het_obs(.data$genotypes)) %>% summarise(het_obs = mean(indiv_het_obs))
-}
+# #' @export
+# #' @rdname indiv_het_obs
+# indiv_het_obs.grouped_df <- function(.x, ...){
+#   .x %>% mutate(indiv_het_obs = indiv_het_obs(.data$genotypes)) %>% summarise(het_obs = mean(indiv_het_obs))
+# }
diff --git a/R/indiv_missingness.R b/R/indiv_missingness.R
@@ -42,9 +42,9 @@ indiv_missingness.vctrs_bigSNP <- function(.x, as_counts = FALSE, ...){
   row_na
 }
 
-#' @export
-#' @rdname indiv_missingness
-indiv_missingness.grouped_df <- function(.x, as_counts = FALSE, ...){
-  .x %>% mutate(indiv_missingness = indiv_missingness(.data$genotypes, as_counts = as_counts)) %>%
-    summarise(het_obs = mean(indiv_missingness))
-}
+#' #' @export
+#' #' @rdname indiv_missingness
+#' indiv_missingness.grouped_df <- function(.x, as_counts = FALSE, ...){
+#'   .x %>% mutate(indiv_missingness = indiv_missingness(.data$genotypes, as_counts = as_counts)) %>%
+#'     summarise(het_obs = mean(indiv_missingness))
+#' }
diff --git a/R/indiv_ploidy.R b/R/indiv_ploidy.R
@@ -31,9 +31,9 @@ indiv_ploidy.vctrs_bigSNP <- function(.x, ...){
   }
 }
 
-#' @export
-#' @rdname indiv_ploidy
-indiv_ploidy.grouped_df <- function(.x, ...){
-  .x %>% mutate(indiv_ploidy = indiv_ploidy(.data$genotypes)) %>%
-    summarise(ploidy = mean(indiv_ploidy))
-}
+#' #' @export
+#' #' @rdname indiv_ploidy
+#' indiv_ploidy.grouped_df <- function(.x, ...){
+#'   .x %>% mutate(indiv_ploidy = indiv_ploidy(.data$genotypes)) %>%
+#'     summarise(ploidy = mean(indiv_ploidy))
+#' }
diff --git a/R/pop_fis.R b/R/pop_fis.R
@@ -1,19 +1,89 @@
 #' Compute population specific FIS
 #'
-#' This function computes population specific FIS (as computed by [hierfstat::fis.dosage()]).
+#' This function computes population specific FIS, using either the approach of Nei 1987 (as computed by [hierfstat::basic.stats()]) or of Weir and Goudet 2017
+#' (as computed by [hierfstat::fis.dosage()]).
+#' @references
+#' Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press
+#' Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085
 #' @param .x a grouped [`gen_tibble`] (as obtained by using [dplyr::group_by()])
-#' @param include_global boolean determining whether, besides the population specific fis, a global
-#' fis should be appended. Note that this will return a vector of n populations plus 1 (the global value)
-#' @param allele_sharing_mat optional, the matrix of Allele Sharing returned by
+#' @param method one of "Nei87" (based on Nei 1987, eqn 7.41) or "WG17" (for Weir and Goudet 2017) to compute FIS
+#' @param by_locus boolean, determining whether FIS should be returned by locus(TRUE),
+#' or as a single genome wide value (FALSE, the default). Note that this is only relevant for "Nei87",
+#' as "WG17" always returns a single value.
+#' @param include_global boolean determining whether, besides the population specific estiamtes, a global
+#' estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value),
+#' or a matrix with n+1 columns if `by_locus=TRUE`.
+#' @param allele_sharing_mat optional and only relevant for "WG17", the matrix of Allele Sharing returned by
 #' [pairwise_allele_sharing()] with `as_matrix=TRUE`. As a number of statistics can be
 #' derived from the Allele Sharing matrix,
 #' it it sometimes more efficient to pre-compute this matrix.
 #' @returns a vector of population specific fis (plus the global value if `include_global=TRUE`)
 #' @export
 
-pop_fis <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){
+pop_fis <- function(.x, method = c("Nei87", "WG17"), by_locus = FALSE, include_global=FALSE, allele_sharing_mat = NULL){
+  method <- match.arg(method)
+  if (method == "Nei87"){
+    if (!is.null(allele_sharing_mat)){
+      stop("allele_sharing_mat not relevant for Nei87")
+    }
+    pop_fis_nei87(.x, by_locus = by_locus, include_global = include_global)
+  } else if (method == "WG17"){
+    if (by_locus){
+      stop("by_locus not implemented for WG17")
+    }
+    pop_fis_wg17(.x, include_global=include_global, allele_sharing_mat = allele_sharing_mat)
+  }
+}
+
+pop_fis_nei87 <- function(.x, by_locus = FALSE, include_global=include_global,
+                          n_cores = bigstatsr::nb_cores()){
+    stopifnot_diploid(.x)
+    # get the populations if it is a grouped gen_tibble
+    if (inherits(.x,"grouped_df")){
+      .group_levels <- .x %>% group_keys()
+      .group_ids <- dplyr::group_indices(.x)-1
+    } else { # create a dummy pop
+      .group_levels = tibble(population="pop")
+      .group_ids <- rep(0,nrow(.x))
+    }
+
+    # summarise population frequencies
+    pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes,
+                                         rowInd = .gt_bigsnp_rows(.x),
+                                         colInd = .gt_bigsnp_cols(.x),
+                                         groupIds = .group_ids,
+                                         ngroups = nrow(.group_levels),
+                                         ncores = n_cores)
+    sHo <-pop_freqs_df$het_obs
+    mHo <- apply(sHo, 1, mean, na.rm = TRUE)
+    n <-pop_freqs_df$n/2
+    # sum of squared frequencies
+    sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2
+    Hs <- (1 - sp2 - sHo/2/n)
+    Hs <- n/(n - 1) * Hs
+    Fis = 1 - sHo/Hs
+
+    colnames(Fis)<- .group_levels %>% dplyr::pull(1)
+    if (by_locus){
+      if (include_global){
+        global <- (.x %>% pop_global_stats(by_locus = TRUE, n_cores = n_cores))$Fis
+        Fis <- cbind(Fis, global)
+      }
+    } else {
+      Fis <- colMeans(Fis, na.rm = TRUE)
+      if (include_global){
+         global <- (.x %>% pop_global_stats(by_locus = FALSE, n_cores = n_cores))["Fis"]
+         names(global) <- "global"
+         Fis <- c(Fis, global)
+      }
+    }
+    return(Fis)
+}
+
+
+pop_fis_wg17 <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){
   if (!inherits(.x,"grouped_df")){
-    stop (".x should be a grouped df")
+    stop (".x should be a grouped gen_tibble")
   }
   if (is.null(allele_sharing_mat)){
     allele_sharing_mat <- pairwise_allele_sharing(.x, as_matrix = TRUE)

diff --git a/R/pop_fst.R b/R/pop_fst.R
@@ -1,6 +1,8 @@
 #' Compute population specific Fst
 #'
-#' This function computes population specific Fst (as computed by [hierfstat::fst.dosage()]).
+#' This function computes population specific Fst, using the approach in Weir and Goudet 2017
+#' (as computed by [hierfstat::fst.dosage()]).
+#' @references Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085
 #' @param .x a grouped [`gen_tibble`] (as obtained by using [dplyr::group_by()])
 #' @param include_global boolean determining whether, besides the population specific Fst, a global
 #' Fst should be appended. Note that this will return a vector of n populations plus 1 (the global value)
@@ -12,7 +14,7 @@
 
 pop_fst <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){
   if (!inherits(.x,"grouped_df")){
-    stop (".x should be a grouped df")
+    stop (".x should be a grouped gen_tibble")
   }
   if (is.null(allele_sharing_mat)){
     allele_sharing_mat <- pairwise_allele_sharing(.x, as_matrix = TRUE)