Skip to content

Commit

Permalink
add feature in tpr estimation
Browse files Browse the repository at this point in the history
  • Loading branch information
davidebolo1993 committed Nov 14, 2024
1 parent f393e04 commit 82aa524
Showing 1 changed file with 42 additions and 7 deletions.
49 changes: 42 additions & 7 deletions cosigt_smk/workflow/scripts/plottpr.r
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,41 @@ get_region <- function(x) {
}

#Helper function #2
process_file <- function(file_path, clusters) {
process_file <- function(file_path, clusters, distances) {
df <- fread(file_path, header = TRUE)
sample <- basename(dirname(dirname(file_path)))
sample<-unlist(strsplit(sample, ".", fixed=TRUE))[1]
haps <- names(clusters)[grepl(sample, names(clusters))]
true_clusters <- sort(c(clusters[[haps[1]]], clusters[[haps[2]]]))
predicted_clusters <- sort(c(df$cluster.1[1], df$cluster.2[1]))
predicted_haplos <- c(df[[1]][1], df[[2]][1])

if (length(haps) != 2) {
true_clusters<-c("ambiguous", "ambiguous")
} else {
true.cl.1<-ifelse(clusters[[haps[1]]] == "*", "unclustered", clusters[[haps[1]]])
true.cl.2<-ifelse(clusters[[haps[2]]] == "*", "unclustered", clusters[[haps[2]]])
true_clusters<-sort(c(true.cl.1, true.cl.2))
}
pred.cl.1<-ifelse(df$cluster.1[1] == "*", "unclustered", df$cluster.1[1])
pred.cl.2<-ifelse(df$cluster.2[1] == "*", "unclustered", df$cluster.2[1])
predicted_clusters<-sort(c(pred.cl.1, pred.cl.2))
predicted_haplos <- sort(c(df[[1]][1], df[[2]][1]))

list(
#for table/plotting
lenient = identical(true_clusters, predicted_clusters),
strict = length(grep(sample, predicted_haplos)) == 2,
w_lenient = ifelse(identical(true_clusters, predicted_clusters), "", sample),
w_strict = ifelse(length(grep(sample, predicted_haplos)) == 2, "", sample),
g_lenient = ifelse(identical(true_clusters, predicted_clusters), sample, ""),
g_strict = ifelse(length(grep(sample, predicted_haplos)) == 2, sample, "")
g_strict = ifelse(length(grep(sample, predicted_haplos)) == 2, sample, ""),
#for table
sample.id = sample,
hap.1.pred = predicted_haplos[1],
hap.2.pred = predicted_haplos[2],
cl.1.pred = predicted_clusters[1],
cl.2.pred = predicted_clusters[2],
cl.1.true = true_clusters[1],
cl.2.true = true_clusters[2],
cl.1.pred.true.dist = ifelse(true_clusters[1] == "ambiguous" || true_clusters[1] == "unclustered" || predicted_clusters[1] == "unclustered", "-", distances[which(distances$h.group == true_clusters[1]),][[predicted_clusters[1]]]),
cl.2.pred.true.dist = ifelse(true_clusters[2] == "ambiguous" || true_clusters[2] == "unclustered" || predicted_clusters[2] == "unclustered", "-", distances[which(distances$h.group == true_clusters[2]),][[predicted_clusters[2]]])
)
}

Expand All @@ -43,16 +62,31 @@ regions <- unique(sapply(files, get_region))
tpr_list <- lapply(regions, function(r) {
json_file <- file.path(args[2], paste0(r, ".clusters.json"))
clusters <- fromJSON(file = json_file)
dist_file <-file.path(args[2], paste0(r, ".clusters.hapdist.tsv"))
distances<-fread(dist_file)
region_files <- files[grepl(r, files)]
results <- lapply(region_files, process_file, clusters = clusters, distances=distances)

results <- lapply(region_files, process_file, clusters = clusters)
lenient_scores <- sapply(results, `[[`, "lenient")
strict_scores <- sapply(results, `[[`, "strict")
lenient_w_names<-sapply(results, `[[`, "w_lenient")
strict_w_names<-sapply(results, `[[`, "w_strict")
lenient_g_names<-sapply(results, `[[`, "g_lenient")
strict_g_names<-sapply(results, `[[`, "g_strict")

sample<-sapply(results, `[[`, "sample.id")
hap.1.pred<-sapply(results, `[[`, "hap.1.pred")
hap.2.pred<-sapply(results, `[[`, "hap.2.pred")
cl.1.pred<-sapply(results, `[[`, "cl.1.pred")
cl.2.pred<-sapply(results, `[[`, "cl.2.pred")
cl.1.true<-sapply(results, `[[`, "cl.1.true")
cl.2.true<-sapply(results, `[[`, "cl.2.true")
cl.1.pred.true.dist<-sapply(results, `[[`, "cl.1.pred.true.dist")
cl.2.pred.true.dist<-sapply(results, `[[`, "cl.2.pred.true.dist")

btab<-data.frame(sample=sample, hap.1.pred = hap.1.pred, hap.2.pred = hap.2.pred, cl.1.pred = cl.1.pred, cl.2.pred=cl.2.pred, cl.1.true=cl.1.true, cl.2.true=cl.2.true, cl.1.pred.true.dist = cl.1.pred.true.dist, cl.2.pred.true.dist=cl.2.pred.true.dist)
fwrite(btab, gsub("pdf", paste0(r,".wdist.tsv"), args[3]),col.names=T, row.names=F, sep="\t")

data.frame(
category = rep(c("lenient", "strict"), each = 2),
measure = rep(c("tp", "fn"), 2),
Expand All @@ -61,6 +95,7 @@ tpr_list <- lapply(regions, function(r) {
sum(strict_scores), length(region_files) - sum(strict_scores)),
ids=c(paste(lenient_g_names[lenient_g_names!=""],collapse=";"),paste(lenient_w_names[lenient_w_names!=""],collapse=";"),paste(strict_g_names[strict_g_names!=""],collapse=";"),paste(strict_w_names[strict_w_names!=""],collapse=";"))
)

})

tpr_df <- rbindlist(tpr_list)
Expand Down

0 comments on commit 82aa524

Please sign in to comment.