Skip to content

Commit

Permalink
Merge pull request #71 from EvolEcolGroup/reorder_admix_plots
Browse files Browse the repository at this point in the history
Reorder admix plots
  • Loading branch information
dramanica authored Jan 21, 2025
2 parents c254875 + 93af987 commit 47e2dd2
Show file tree
Hide file tree
Showing 21 changed files with 941 additions and 124 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ export(gt_pcadapt)
export(gt_roh_window)
export(gt_save)
export(gt_set_imputed)
export(gt_snmf)
export(gt_update_backingfile)
export(gt_uses_imputed)
export(indiv_het_obs)
Expand Down
2 changes: 1 addition & 1 deletion R/annotate_group_info.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

annotate_group_info <- function(q_tbl){
group <- q_tbl %>% dplyr::distinct(id,group) %>% dplyr::pull(group)
if (length(rle(group)$values)!=length(unique(group))) {
if (length(rle(as.character(group))$values)!=length(unique(group))) {
stop("values in 'group' are not ordered (they should be in consecutive blocks, one per group")
}
# is there a way to get the aesthetic from the plot, so that we can check that
Expand Down
27 changes: 21 additions & 6 deletions R/autoplot_gt_admix.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,18 @@
#' not customisable; we recommend that you use `ggplot2` to produce publication
#' ready plots.
#'
#' This autoplot will automatically rearrange individuals according to
#' their id and any grouping variables if an associated 'data' gen_tibble is provided.
#' To avoid any automatic re-sorting of individuals, set `arrange_by_group` and
#' `arrange_by_indiv` to FALSE. See `autoplot.q_matrix` for further details.
#'
#' @param object an object of class `gt_admixture`
#' @param type the type of plot (one of "cv", and "boxplot")
#' @param type the type of plot (one of "cv", and "barplot")
#' @param k the value of `k` to plot (for `barplot` type only)
#' param repeat the repeat to plot (for `barplot` type only)
#' @param run the run to plot (for `barplot` type only)
#' @param ... additional arguments to be passed to autoplot method for q_matrices [autoplot_q_matrix()].
#' @param ... additional arguments to be passed to autoplot method
#' for q_matrices [autoplot_q_matrix()], used when type is `barplot`.
#' @returns a `ggplot2` object
#' @name autoplot_gt_admix
#' @export
Expand All @@ -28,10 +34,19 @@ autoplot.gt_admix <- function(object,
if (is.null(object$cv)){
stop("No cross validation error available")
}
ggplot2::ggplot(data.frame(k=object$k, cv=object$cv), ggplot2::aes(x=.data$k, y=.data$cv)) +
ggplot2::geom_point() +
ggplot2::geom_line() +
ggplot2::labs(x="k", y="Cross validation error")

if(object$algorithm == "SNMF"){
ggplot2::ggplot(data.frame(k=object$k, cv=object$cv), ggplot2::aes(x=.data$k, y=.data$cv)) +
ggplot2::geom_point() +
ggplot2::geom_line() +
ggplot2::labs(x="k", y="Cross-Entropy")
} else {
ggplot2::ggplot(data.frame(k=object$k, cv=object$cv), ggplot2::aes(x=.data$k, y=.data$cv)) +
ggplot2::geom_point() +
ggplot2::geom_line() +
ggplot2::labs(x="k", y="Cross validation error")
}

} else if (type == "barplot") {
# check that k is specified
if (is.null(k)){
Expand Down
2 changes: 1 addition & 1 deletion R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -543,7 +543,7 @@ change_duplicated_file_name <- function(file){
}
}
# create new file path
new_file <- paste0(dirname(file), "/", base_name, "_v", version)
new_file <- file.path(dirname(file), paste0(base_name, "_v", version))

return(new_file)
}
Expand Down
3 changes: 2 additions & 1 deletion R/gt_admix_methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,10 @@ gt_admix_reorder_q <- function(x, group = NULL) {
stop("You must provide a group variable if there is no grouping information in the gt_admix object")
}
}

group_meta<- tibble(id = seq(1, length(x$group)), group = x$group)
# sort group meta by group
group_meta <- group_meta[order(group_meta$group),]
group_meta <- group_meta %>% arrange(.data$group)
# reorder the q matrices
x$Q <- lapply(x$Q, function(y) y[group_meta$id,])
# if we have an id element, reorder it
Expand Down
6 changes: 6 additions & 0 deletions R/gt_admixture.R
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,12 @@ gt_admixture <- function (x, k, n_runs = 1, crossval = FALSE, n_cores = 1, seed
conda = conda_env)
}

# check if no .Q files were written and if adm_out contains "Error:"
# stop and print adm_out if both are true
if (length(grep(".Q", list.files(out))) == 0 & length(grep("Error:", adm_out)) > 0){
stop(adm_out)
}

# read the output
output_prefix <- file.path(out, gsub(".bed", "", basename(input_file)))
adm_list$k[index] <- this_k
Expand Down
188 changes: 188 additions & 0 deletions R/gt_snmf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
#' Run SNMF from R in tidypopgen
#'
#' @details This is a wrapper for the function snmf from R package LEA.
#'
#' @param x a `gen_tibble` or a character giving the path to the input geno file
#' @param k an integer giving the number of clusters
#' @param project one of "continue", "new", and "force": "continue" stores files in the current
#' project, "new" creates a new project, and "force" stores results in the current project even
#' if the .geno input file has been altered,
#' @param n_runs the number of runs for each k value (defaults to 1)
#' @param alpha numeric snmf regularization parameter. See LEA::snmf for details
#' @param tolerance numeric value of tolerance (default 0.00001)
#' @param entropy boolean indicating whether to estimate cross-entropy
#' @param percentage numeric value indicating percentage of masked genotypes,
#' ranging between 0 and 1, to be used when entropy = TRUE
#' @param I number of SNPs for initialising the snmf algorithm
#' @param iterations numeric integer for maximum iterations (default 200)
#' @param ploidy the ploidy of the input data (defaults to 2)
#' @param seed the seed for the random number generator
#' @return an object of class `gt_admix` consisting of a list with the following elements:
#' - `k` the number of clusters
#' - `Q` a matrix with the admixture proportions
#' - `P` a matrix with the allele frequencies
#' - `log` a log of the output generated by ADMIXTURE (usually printed on the screen when running from the command line)
#' - `cv` the masked cross-entropy (if `entropy` is TRUE)
#' - `loglik` the log likelihood of the model
#' - `id` the id column of the input `gen_tibble` (if applicable)
#' - `group` the group column of the input `gen_tibble` (if applicable)
#' @export

gt_snmf <- function (x, k, project = "continue", n_runs = 1, alpha, tolerance = 0.00001, entropy = FALSE,
percentage = 0.05, I, iterations = 200, ploidy = 2, seed = -1){

# add seed check again!!!
if (!is.null(seed) && length(seed)!= n_runs){
stop("'seed' should be a vector of length 'n_runs'")
}

# if required install LEA
if (!requireNamespace("LEA", quietly = TRUE)) {
utils::install.packages("LEA")
}

if(inherits(x, "gen_tbl")){
input_file <- gt_as_geno_lea(x)
# expand path to file to be full path
input_file <- normalizePath(input_file)
out_file <- sub(".geno","", input_file)
file_name <- sub(".geno","", basename(input_file))
} else if(inherits(x, "character")){
if (!file.exists(x)) {
stop("The file ", x, " does not exist")
}
# check whether the file ends in .geno
if (!grepl(".geno$", x)) {
stop("The input file must be a .geno file")
}
input_file <- x
out_file <- sub(".geno","", input_file)
file_name <- sub(".geno","", basename(input_file))
} else if(!inherits(x, "character")){
stop("x must be a gen_tibble or a character giving the path to the input geno file")
}

# cast k as an integer
k <- as.integer(k)

# initialise list to store results
adm_list <- list(
k = integer(),
Q = list(),
P = list(),
log = list(),
loglik = numeric(),
G = list()
)
class(adm_list) <- c("gt_admix","list")

if(entropy){
snmf_res <- utils::capture.output(LEA::snmf(input.file = input_file,
K = k,
project = project,
repetitions = n_runs,
alpha = alpha,
tolerance = tolerance,
entropy = entropy,
percentage = percentage,
I = I,
iterations = iterations,
ploidy = ploidy,
seed = seed))
} else {
snmf_res <- utils::capture.output(LEA::snmf(input.file = input_file,
K = k,
project = project,
repetitions = n_runs,
alpha = alpha,
tolerance = tolerance,
I = I,
iterations = iterations,
ploidy = ploidy,
seed = seed))
}

# loop over values of k and number of repeats
index <- 1
for (this_k in as.integer(k)) {
for (this_rep in seq_len(n_runs)) {

adm_list$k[index] <- this_k
adm_list$Q[[index]] <- q_matrix(utils::read.table(paste0(out_file,".snmf/K",this_k,
"/run",this_rep,"/",file_name,
"_r",this_rep,".",this_k,".Q"), header = FALSE))
adm_list$G[[index]] <- q_matrix(utils::read.table(paste0(out_file,".snmf/K",this_k,
"/run",this_rep,"/",file_name,
"_r",this_rep,".",this_k,".G"), header = FALSE))
index <- index + 1
}
}

# add log
adm_list$log <- snmf_res

# add entropy to cv slot
if (entropy) {
# extract value from line with cross-Entropy (number after :)
entropy <- extract_cross_entropy(snmf_res)
adm_list$cv <- entropy$cross_entropy_masked
}

# add metadata if x is a gen_tibble
if (inherits(x, "gen_tbl")) {
adm_list$id <- x$id
# if it is grouped, add the group
if (inherits(x, "grouped_gen_tbl")) {
adm_list$group <- x[[dplyr::group_vars(x)]]
}
}

# add info on algorithm
adm_list$algorithm <- "SNMF"

return(adm_list)
}


# Internal function for extracting cross-entropy from the log output
extract_cross_entropy <- function(log_text) {
# Initialize an empty tibble
results <- tibble(K = integer(),
repetition = integer(),
cross_entropy_all = numeric(),
cross_entropy_masked = numeric())

# Loop through the text and extract data
for (i in seq_along(log_text)) {
line <- log_text[i]

# Match the "sNMF K = x repetition y" line
if (grepl("sNMF K = \\d+ repetition \\d+", line)) {
# Extract K and repetition
matches <- regmatches(line, regexec("sNMF K = (\\d+) repetition (\\d+)", line))
K <- as.integer(matches[[1]][2])
repetition <- as.integer(matches[[1]][3])
}

# Match the "Cross-Entropy (all data):" line
if (grepl("Cross-Entropy \\(all data\\):", line)) {
matches <- regmatches(line, regexec("Cross-Entropy \\(all data\\):\\s+([0-9.]+)", line))
cross_entropy_all <- as.numeric(matches[[1]][2])
}

# Match the "Cross-Entropy (masked data):" line
if (grepl("Cross-Entropy \\(masked data\\):", line)) {
matches <- regmatches(line, regexec("Cross-Entropy \\(masked data\\):\\s+([0-9.]+)", line))
cross_entropy_masked <- as.numeric(matches[[1]][2])

# Add the collected data to the tibble
results <- add_row(results,
K = K,
repetition = repetition,
cross_entropy_all = cross_entropy_all,
cross_entropy_masked = cross_entropy_masked)
}
}

return(results)
}
11 changes: 11 additions & 0 deletions R/is_loci_table_ordered.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,17 @@ is_loci_table_ordered.vctrs_bigSNP <- function(.x, error_on_false = FALSE, ignor
}
}

# check that within each chromosome positions are unique
if (any(unlist(show_loci(.x) %>%
group_by(.data$chr_int) %>%
group_map(~ duplicated(.x$position))))){
if (error_on_false){
stop("Your loci table contains duplicates")
} else {
return(FALSE)
}
}

# check that all positions in a chromosome are adjacent
if(any(duplicated(rle(show_loci(.x)$chr_int)$values))){
if (error_on_false){
Expand Down
Loading

0 comments on commit 47e2dd2

Please sign in to comment.