Merge pull request #71 from EvolEcolGroup/reorder_admix_plots

Reorder admix plots
EvolEcolGroup · Jan 21, 2025 · 47e2dd2 · 47e2dd2
2 parents c254875 + 93af987
commit 47e2dd2
Show file tree

Hide file tree

Showing 21 changed files with 941 additions and 124 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -118,6 +118,7 @@ export(gt_pcadapt)
 export(gt_roh_window)
 export(gt_save)
 export(gt_set_imputed)
+export(gt_snmf)
 export(gt_update_backingfile)
 export(gt_uses_imputed)
 export(indiv_het_obs)

diff --git a/R/annotate_group_info.R b/R/annotate_group_info.R
@@ -10,7 +10,7 @@
 
 annotate_group_info <- function(q_tbl){
   group <- q_tbl %>% dplyr::distinct(id,group) %>% dplyr::pull(group)
-  if (length(rle(group)$values)!=length(unique(group))) {
+  if (length(rle(as.character(group))$values)!=length(unique(group))) {
     stop("values in 'group' are not ordered (they should be in consecutive blocks, one per group")
   }
 # is there a way to get the aesthetic from the plot, so that we can check that

diff --git a/R/autoplot_gt_admix.R b/R/autoplot_gt_admix.R
@@ -8,12 +8,18 @@
 #' not customisable; we recommend that you use `ggplot2` to produce publication
 #' ready plots.
 #'
+#' This autoplot will automatically rearrange individuals according to
+#' their id and any grouping variables if an associated 'data' gen_tibble is provided.
+#' To avoid any automatic re-sorting of individuals, set `arrange_by_group` and
+#' `arrange_by_indiv` to FALSE. See `autoplot.q_matrix` for further details.
+#'
 #' @param object an object of class `gt_admixture`
-#' @param type the type of plot (one of "cv", and "boxplot")
+#' @param type the type of plot (one of "cv", and "barplot")
 #' @param k the value of `k` to plot (for `barplot` type only)
 #' param repeat the repeat to plot (for `barplot` type only)
 #' @param run the run to plot (for `barplot` type only)
-#' @param ... additional arguments to be passed to autoplot method for q_matrices [autoplot_q_matrix()].
+#' @param ... additional arguments to be passed to autoplot method
+#' for q_matrices [autoplot_q_matrix()], used when type is `barplot`.
 #' @returns a `ggplot2` object
 #' @name autoplot_gt_admix
 #' @export
@@ -28,10 +34,19 @@ autoplot.gt_admix <- function(object,
     if (is.null(object$cv)){
       stop("No cross validation error available")
     }
-    ggplot2::ggplot(data.frame(k=object$k, cv=object$cv), ggplot2::aes(x=.data$k, y=.data$cv)) +
-      ggplot2::geom_point() +
-      ggplot2::geom_line() +
-      ggplot2::labs(x="k", y="Cross validation error")
+
+    if(object$algorithm == "SNMF"){
+      ggplot2::ggplot(data.frame(k=object$k, cv=object$cv), ggplot2::aes(x=.data$k, y=.data$cv)) +
+        ggplot2::geom_point() +
+        ggplot2::geom_line() +
+        ggplot2::labs(x="k", y="Cross-Entropy")
+    } else {
+      ggplot2::ggplot(data.frame(k=object$k, cv=object$cv), ggplot2::aes(x=.data$k, y=.data$cv)) +
+        ggplot2::geom_point() +
+        ggplot2::geom_line() +
+        ggplot2::labs(x="k", y="Cross validation error")
+    }
+
   } else if (type == "barplot") {
     # check that k is specified
     if (is.null(k)){

diff --git a/R/gen_tibble.R b/R/gen_tibble.R
@@ -543,7 +543,7 @@ change_duplicated_file_name <- function(file){
       }
     }
     # create new file path
-    new_file <- paste0(dirname(file), "/", base_name, "_v", version)
+    new_file <- file.path(dirname(file), paste0(base_name, "_v", version))
 
     return(new_file)
   }

diff --git a/R/gt_admix_methods.R b/R/gt_admix_methods.R
@@ -113,9 +113,10 @@ gt_admix_reorder_q <- function(x, group = NULL) {
       stop("You must provide a group variable if there is no grouping information in the gt_admix object")
     }
   }
+
   group_meta<- tibble(id = seq(1, length(x$group)), group = x$group)
   # sort group meta by group
-  group_meta <- group_meta[order(group_meta$group),]
+  group_meta <- group_meta %>% arrange(.data$group)
   # reorder the q matrices
   x$Q <- lapply(x$Q, function(y) y[group_meta$id,])
   # if we have an id element, reorder it

diff --git a/R/gt_admixture.R b/R/gt_admixture.R
@@ -143,6 +143,12 @@ gt_admixture <- function (x, k, n_runs = 1, crossval = FALSE, n_cores = 1, seed
                                conda = conda_env)
       }
 
+      # check if no .Q files were written and if adm_out contains "Error:"
+      # stop and print adm_out if both are true
+      if (length(grep(".Q", list.files(out))) == 0 & length(grep("Error:", adm_out)) > 0){
+        stop(adm_out)
+      }
+
       # read the output
       output_prefix <- file.path(out, gsub(".bed", "", basename(input_file)))
       adm_list$k[index] <- this_k

diff --git a/R/gt_snmf.R b/R/gt_snmf.R
@@ -0,0 +1,188 @@
+#' Run SNMF from R in tidypopgen
+#'
+#' @details This is a wrapper for the function snmf from R package LEA.
+#'
+#' @param x a `gen_tibble` or a character giving the path to the input geno file
+#' @param k an integer giving the number of clusters
+#' @param project one of "continue", "new", and "force": "continue" stores files in the current
+#' project, "new" creates a new project, and "force" stores results in the current project even
+#' if the .geno input file has been altered,
+#' @param n_runs the number of runs for each k value (defaults to 1)
+#' @param alpha numeric snmf regularization parameter. See LEA::snmf for details
+#' @param tolerance numeric value of tolerance (default 0.00001)
+#' @param entropy boolean indicating whether to estimate cross-entropy
+#' @param percentage numeric value indicating percentage of masked genotypes,
+#' ranging between 0 and 1, to be used when entropy = TRUE
+#' @param I number of SNPs for initialising the snmf algorithm
+#' @param iterations numeric integer for maximum iterations (default 200)
+#' @param ploidy the ploidy of the input data (defaults to 2)
+#' @param seed the seed for the random number generator
+#' @return an object of class `gt_admix` consisting of a list with the following elements:
+#' - `k` the number of clusters
+#' - `Q` a matrix with the admixture proportions
+#' - `P` a matrix with the allele frequencies
+#' - `log` a log of the output generated by ADMIXTURE (usually printed on the screen when running from the command line)
+#' - `cv` the masked cross-entropy (if `entropy` is TRUE)
+#' - `loglik` the log likelihood of the model
+#' - `id` the id column of the input `gen_tibble` (if applicable)
+#' - `group` the group column of the input `gen_tibble` (if applicable)
+#' @export
+
+gt_snmf <- function (x, k, project = "continue", n_runs = 1, alpha, tolerance = 0.00001, entropy = FALSE,
+                     percentage = 0.05, I, iterations = 200, ploidy = 2, seed = -1){
+
+  # add seed check again!!!
+  if (!is.null(seed) && length(seed)!= n_runs){
+    stop("'seed' should be a vector of length 'n_runs'")
+  }
+
+  # if required install LEA
+  if (!requireNamespace("LEA", quietly = TRUE)) {
+    utils::install.packages("LEA")
+  }
+
+  if(inherits(x, "gen_tbl")){
+    input_file <- gt_as_geno_lea(x)
+    # expand path to file to be full path
+    input_file <- normalizePath(input_file)
+    out_file <- sub(".geno","", input_file)
+    file_name <- sub(".geno","", basename(input_file))
+  } else if(inherits(x, "character")){
+    if (!file.exists(x)) {
+      stop("The file ", x, " does not exist")
+    }
+    # check whether the file ends in .geno
+    if (!grepl(".geno$", x)) {
+      stop("The input file must be a .geno file")
+    }
+    input_file <- x
+    out_file <- sub(".geno","", input_file)
+    file_name <- sub(".geno","", basename(input_file))
+  } else if(!inherits(x, "character")){
+    stop("x must be a gen_tibble or a character giving the path to the input geno file")
+  }
+
+  # cast k as an integer
+  k <- as.integer(k)
+
+  # initialise list to store results
+  adm_list <- list(
+    k = integer(),
+    Q = list(),
+    P = list(),
+    log = list(),
+    loglik = numeric(),
+    G = list()
+  )
+  class(adm_list) <- c("gt_admix","list")
+
+  if(entropy){
+    snmf_res <- utils::capture.output(LEA::snmf(input.file = input_file,
+                                                K = k,
+                                                project = project,
+                                                repetitions = n_runs,
+                                                alpha = alpha,
+                                                tolerance = tolerance,
+                                                entropy = entropy,
+                                                percentage = percentage,
+                                                I = I,
+                                                iterations = iterations,
+                                                ploidy = ploidy,
+                                                seed = seed))
+  } else {
+    snmf_res <- utils::capture.output(LEA::snmf(input.file = input_file,
+                                                K = k,
+                                                project = project,
+                                                repetitions = n_runs,
+                                                alpha = alpha,
+                                                tolerance = tolerance,
+                                                I = I,
+                                                iterations = iterations,
+                                                ploidy = ploidy,
+                                                seed = seed))
+  }
+
+  # loop over values of k and number of repeats
+  index <- 1
+  for (this_k in as.integer(k)) {
+    for (this_rep in seq_len(n_runs)) {
+
+      adm_list$k[index] <- this_k
+      adm_list$Q[[index]] <- q_matrix(utils::read.table(paste0(out_file,".snmf/K",this_k,
+                                                               "/run",this_rep,"/",file_name,
+                                                               "_r",this_rep,".",this_k,".Q"), header = FALSE))
+      adm_list$G[[index]] <- q_matrix(utils::read.table(paste0(out_file,".snmf/K",this_k,
+                                                               "/run",this_rep,"/",file_name,
+                                                               "_r",this_rep,".",this_k,".G"), header = FALSE))
+     index <- index + 1
+    }
+  }
+
+  # add log
+  adm_list$log <- snmf_res
+
+  # add entropy to cv slot
+  if (entropy) {
+    # extract value from line with cross-Entropy (number after :)
+    entropy <- extract_cross_entropy(snmf_res)
+    adm_list$cv <- entropy$cross_entropy_masked
+  }
+
+  # add metadata if x is a gen_tibble
+  if (inherits(x, "gen_tbl")) {
+    adm_list$id <- x$id
+    # if it is grouped, add the group
+    if (inherits(x, "grouped_gen_tbl")) {
+      adm_list$group <- x[[dplyr::group_vars(x)]]
+    }
+  }
+
+  # add info on algorithm
+  adm_list$algorithm <- "SNMF"
+
+  return(adm_list)
+}
+
+
+# Internal function for extracting cross-entropy from the log output
+extract_cross_entropy <- function(log_text) {
+  # Initialize an empty tibble
+  results <- tibble(K = integer(),
+                    repetition = integer(),
+                    cross_entropy_all = numeric(),
+                    cross_entropy_masked = numeric())
+
+  # Loop through the text and extract data
+  for (i in seq_along(log_text)) {
+    line <- log_text[i]
+
+    # Match the "sNMF K = x repetition y" line
+    if (grepl("sNMF K = \\d+  repetition \\d+", line)) {
+      # Extract K and repetition
+      matches <- regmatches(line, regexec("sNMF K = (\\d+)  repetition (\\d+)", line))
+      K <- as.integer(matches[[1]][2])
+      repetition <- as.integer(matches[[1]][3])
+    }
+
+    # Match the "Cross-Entropy (all data):" line
+    if (grepl("Cross-Entropy \\(all data\\):", line)) {
+      matches <- regmatches(line, regexec("Cross-Entropy \\(all data\\):\\s+([0-9.]+)", line))
+      cross_entropy_all <- as.numeric(matches[[1]][2])
+    }
+
+    # Match the "Cross-Entropy (masked data):" line
+    if (grepl("Cross-Entropy \\(masked data\\):", line)) {
+      matches <- regmatches(line, regexec("Cross-Entropy \\(masked data\\):\\s+([0-9.]+)", line))
+      cross_entropy_masked <- as.numeric(matches[[1]][2])
+
+      # Add the collected data to the tibble
+      results <- add_row(results,
+                         K = K,
+                         repetition = repetition,
+                         cross_entropy_all = cross_entropy_all,
+                         cross_entropy_masked = cross_entropy_masked)
+    }
+  }
+
+  return(results)
+}
diff --git a/R/is_loci_table_ordered.R b/R/is_loci_table_ordered.R
@@ -43,6 +43,17 @@ is_loci_table_ordered.vctrs_bigSNP <- function(.x, error_on_false = FALSE, ignor
     }
   }
 
+  # check that within each chromosome positions are unique
+  if (any(unlist(show_loci(.x) %>%
+                 group_by(.data$chr_int) %>%
+                 group_map(~ duplicated(.x$position))))){
+    if (error_on_false){
+      stop("Your loci table contains duplicates")
+    } else {
+      return(FALSE)
+    }
+  }
+
   # check that all positions in a chromosome are adjacent
   if(any(duplicated(rle(show_loci(.x)$chr_int)$values))){
     if (error_on_false){