-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathNchart_script.R
98 lines (63 loc) · 3.33 KB
/
Nchart_script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
library(ggplot2)
library(scales)
# Edit these inputs:
setwd("~/Desktop/ENTEx/assembly_lengths")
# output prefix
prefix <-"~/Desktop/ENTEx/assembly_lengths/ENC2"
# All files must be tab-delimited with the chromosome/contig lengths in the second column
# These can be generated by running fastalengths, for instance:
# fastalengths my_reference.fasta > my_reference.lengths
# Reference filename
filename_ref <- "GRCh38.lengths"
# Query filenames and names for the plot
query_filenames <- c("falcon_contigs.lengths","megahit_contigs.lengths", "supernova_scaffolds_1.lengths", "supernova_contigs_1.lengths")
query_names <- c("FALCON", "Megahit contigs", "Supernova scaffolds", "Supernova contigs")
# first color is the reference, the others are the queries in order
colors <- c("blue","purple","green","coral","orange")
#############################################################################
ref.data <- read.csv(filename_ref, sep="\t", quote='',header=FALSE)
names(ref.data) <- c("name","length")
ref.data$length <- sort(as.numeric(ref.data$length),decreasing=TRUE)
genome.length <- max(sum(ref.data$length))
ref.cumsum <- data.frame(NG=cumsum(ref.data$length/genome.length*100),contig.length=ref.data$length,contig.source="Reference")
ref.cumsum.0 <- rbind(data.frame(NG=c(0),contig.length=max(ref.cumsum$contig.length),contig.source="Reference"),ref.cumsum)
for (i in seq(length(query_filenames))) {
filename_query = query_filenames[i]
name_query = query_names[i]
print(paste(filename_query,name_query))
query.data <- read.csv(filename_query, sep="\t", quote='',header=FALSE)
names(query.data) <- c("name","length")
query.data$length <- sort(as.numeric(query.data$length),decreasing=TRUE)
query.cumsum <- data.frame(NG=cumsum(query.data$length/genome.length*100),contig.length=query.data$length,contig.source=name_query)
query.cumsum.0 <- rbind(data.frame(NG=c(0),contig.length=max(query.cumsum$contig.length),contig.source=name_query),query.cumsum)
ref.cumsum <- rbind(ref.cumsum,query.cumsum)
ref.cumsum.0 <- rbind(ref.cumsum.0,query.cumsum.0)
}
bp_format<-function(num) {
if (num > 1000000000) {
paste(formatC(num/1000000000,format="f",digits=1,big.mark=",",drop0trailing = TRUE)," Gbp",sep="")
}
else if (num > 1000000) {
paste(formatC(num/1000000,format="f",digits=1,big.mark=",",drop0trailing = TRUE)," Mbp",sep="")
}
else {
paste(formatC(num,format="f",big.mark=",",drop0trailing = TRUE), " bp", sep="")
}
}
theme_set(theme_bw(base_size = 12) + theme(panel.grid.minor = element_line(colour = NA)))
percent_format <- function(num) {
paste(num,"%", sep="")
}
png(file=paste(prefix,".Assemblytics.Nchart.png",sep=""),width=1600,height=1000,res=200)
print(
ggplot(ref.cumsum.0, aes(x = NG, y = contig.length, color=contig.source)) +
# xlim(0,100) +
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)), limits=c(1,genome.length)) +
scale_x_continuous(labels=percent_format) +
geom_path(size=1.5,alpha=0.5) +
geom_point(data=ref.cumsum, size=2,alpha=0.5) +
labs(x = paste("Percentage of reference (",bp_format(genome.length),")", sep=""),y="Sequence length",colour="Assembly",title="Cumulative sequence length") +
scale_color_manual(values=colors) +
annotation_logticks(sides="lr")
)
dev.off()