-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #71 from EvolEcolGroup/reorder_admix_plots
Reorder admix plots
- Loading branch information
Showing
21 changed files
with
941 additions
and
124 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
#' Run SNMF from R in tidypopgen | ||
#' | ||
#' @details This is a wrapper for the function snmf from R package LEA. | ||
#' | ||
#' @param x a `gen_tibble` or a character giving the path to the input geno file | ||
#' @param k an integer giving the number of clusters | ||
#' @param project one of "continue", "new", and "force": "continue" stores files in the current | ||
#' project, "new" creates a new project, and "force" stores results in the current project even | ||
#' if the .geno input file has been altered, | ||
#' @param n_runs the number of runs for each k value (defaults to 1) | ||
#' @param alpha numeric snmf regularization parameter. See LEA::snmf for details | ||
#' @param tolerance numeric value of tolerance (default 0.00001) | ||
#' @param entropy boolean indicating whether to estimate cross-entropy | ||
#' @param percentage numeric value indicating percentage of masked genotypes, | ||
#' ranging between 0 and 1, to be used when entropy = TRUE | ||
#' @param I number of SNPs for initialising the snmf algorithm | ||
#' @param iterations numeric integer for maximum iterations (default 200) | ||
#' @param ploidy the ploidy of the input data (defaults to 2) | ||
#' @param seed the seed for the random number generator | ||
#' @return an object of class `gt_admix` consisting of a list with the following elements: | ||
#' - `k` the number of clusters | ||
#' - `Q` a matrix with the admixture proportions | ||
#' - `P` a matrix with the allele frequencies | ||
#' - `log` a log of the output generated by ADMIXTURE (usually printed on the screen when running from the command line) | ||
#' - `cv` the masked cross-entropy (if `entropy` is TRUE) | ||
#' - `loglik` the log likelihood of the model | ||
#' - `id` the id column of the input `gen_tibble` (if applicable) | ||
#' - `group` the group column of the input `gen_tibble` (if applicable) | ||
#' @export | ||
|
||
gt_snmf <- function (x, k, project = "continue", n_runs = 1, alpha, tolerance = 0.00001, entropy = FALSE, | ||
percentage = 0.05, I, iterations = 200, ploidy = 2, seed = -1){ | ||
|
||
# add seed check again!!! | ||
if (!is.null(seed) && length(seed)!= n_runs){ | ||
stop("'seed' should be a vector of length 'n_runs'") | ||
} | ||
|
||
# if required install LEA | ||
if (!requireNamespace("LEA", quietly = TRUE)) { | ||
utils::install.packages("LEA") | ||
} | ||
|
||
if(inherits(x, "gen_tbl")){ | ||
input_file <- gt_as_geno_lea(x) | ||
# expand path to file to be full path | ||
input_file <- normalizePath(input_file) | ||
out_file <- sub(".geno","", input_file) | ||
file_name <- sub(".geno","", basename(input_file)) | ||
} else if(inherits(x, "character")){ | ||
if (!file.exists(x)) { | ||
stop("The file ", x, " does not exist") | ||
} | ||
# check whether the file ends in .geno | ||
if (!grepl(".geno$", x)) { | ||
stop("The input file must be a .geno file") | ||
} | ||
input_file <- x | ||
out_file <- sub(".geno","", input_file) | ||
file_name <- sub(".geno","", basename(input_file)) | ||
} else if(!inherits(x, "character")){ | ||
stop("x must be a gen_tibble or a character giving the path to the input geno file") | ||
} | ||
|
||
# cast k as an integer | ||
k <- as.integer(k) | ||
|
||
# initialise list to store results | ||
adm_list <- list( | ||
k = integer(), | ||
Q = list(), | ||
P = list(), | ||
log = list(), | ||
loglik = numeric(), | ||
G = list() | ||
) | ||
class(adm_list) <- c("gt_admix","list") | ||
|
||
if(entropy){ | ||
snmf_res <- utils::capture.output(LEA::snmf(input.file = input_file, | ||
K = k, | ||
project = project, | ||
repetitions = n_runs, | ||
alpha = alpha, | ||
tolerance = tolerance, | ||
entropy = entropy, | ||
percentage = percentage, | ||
I = I, | ||
iterations = iterations, | ||
ploidy = ploidy, | ||
seed = seed)) | ||
} else { | ||
snmf_res <- utils::capture.output(LEA::snmf(input.file = input_file, | ||
K = k, | ||
project = project, | ||
repetitions = n_runs, | ||
alpha = alpha, | ||
tolerance = tolerance, | ||
I = I, | ||
iterations = iterations, | ||
ploidy = ploidy, | ||
seed = seed)) | ||
} | ||
|
||
# loop over values of k and number of repeats | ||
index <- 1 | ||
for (this_k in as.integer(k)) { | ||
for (this_rep in seq_len(n_runs)) { | ||
|
||
adm_list$k[index] <- this_k | ||
adm_list$Q[[index]] <- q_matrix(utils::read.table(paste0(out_file,".snmf/K",this_k, | ||
"/run",this_rep,"/",file_name, | ||
"_r",this_rep,".",this_k,".Q"), header = FALSE)) | ||
adm_list$G[[index]] <- q_matrix(utils::read.table(paste0(out_file,".snmf/K",this_k, | ||
"/run",this_rep,"/",file_name, | ||
"_r",this_rep,".",this_k,".G"), header = FALSE)) | ||
index <- index + 1 | ||
} | ||
} | ||
|
||
# add log | ||
adm_list$log <- snmf_res | ||
|
||
# add entropy to cv slot | ||
if (entropy) { | ||
# extract value from line with cross-Entropy (number after :) | ||
entropy <- extract_cross_entropy(snmf_res) | ||
adm_list$cv <- entropy$cross_entropy_masked | ||
} | ||
|
||
# add metadata if x is a gen_tibble | ||
if (inherits(x, "gen_tbl")) { | ||
adm_list$id <- x$id | ||
# if it is grouped, add the group | ||
if (inherits(x, "grouped_gen_tbl")) { | ||
adm_list$group <- x[[dplyr::group_vars(x)]] | ||
} | ||
} | ||
|
||
# add info on algorithm | ||
adm_list$algorithm <- "SNMF" | ||
|
||
return(adm_list) | ||
} | ||
|
||
|
||
# Internal function for extracting cross-entropy from the log output | ||
extract_cross_entropy <- function(log_text) { | ||
# Initialize an empty tibble | ||
results <- tibble(K = integer(), | ||
repetition = integer(), | ||
cross_entropy_all = numeric(), | ||
cross_entropy_masked = numeric()) | ||
|
||
# Loop through the text and extract data | ||
for (i in seq_along(log_text)) { | ||
line <- log_text[i] | ||
|
||
# Match the "sNMF K = x repetition y" line | ||
if (grepl("sNMF K = \\d+ repetition \\d+", line)) { | ||
# Extract K and repetition | ||
matches <- regmatches(line, regexec("sNMF K = (\\d+) repetition (\\d+)", line)) | ||
K <- as.integer(matches[[1]][2]) | ||
repetition <- as.integer(matches[[1]][3]) | ||
} | ||
|
||
# Match the "Cross-Entropy (all data):" line | ||
if (grepl("Cross-Entropy \\(all data\\):", line)) { | ||
matches <- regmatches(line, regexec("Cross-Entropy \\(all data\\):\\s+([0-9.]+)", line)) | ||
cross_entropy_all <- as.numeric(matches[[1]][2]) | ||
} | ||
|
||
# Match the "Cross-Entropy (masked data):" line | ||
if (grepl("Cross-Entropy \\(masked data\\):", line)) { | ||
matches <- regmatches(line, regexec("Cross-Entropy \\(masked data\\):\\s+([0-9.]+)", line)) | ||
cross_entropy_masked <- as.numeric(matches[[1]][2]) | ||
|
||
# Add the collected data to the tibble | ||
results <- add_row(results, | ||
K = K, | ||
repetition = repetition, | ||
cross_entropy_all = cross_entropy_all, | ||
cross_entropy_masked = cross_entropy_masked) | ||
} | ||
} | ||
|
||
return(results) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.