-
Notifications
You must be signed in to change notification settings - Fork 8
/
mergeGenomicRegions.R
105 lines (89 loc) · 3.86 KB
/
mergeGenomicRegions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env Rscript
# Merge a list of genomic regions and find out complementary regions afterwards.
# Commandline: Rscript mergeGenomicRegions.R [input file] [genome length] [output prefix]
# The input file is a CSV file comprised of two columns ("from" and "to") without a header.
# This script assumes that there are always >= 2 regions in any coordinate tables.
# A typical input is the coordinate file produced by the script filterCoords.py of RedDog.
# Example:
# Rscript mergeGenomicRegions.R coord.txt 5248520
# Outputs:
# (1) [input filename]__merged.csv, (2) [input filename]__comple.csv
# Saved under the current working directory.
#
# Copyright 2017 Yu Wan <wanyuac@gmail.com>
# Licensed under the Apache License, Version 2.0
# Edition: 30 Apr 2017
mergeRegions <- function(x) {
y <- x[1, ] # take the first row of x to start with
j <- 1 # row pointer of y
ub1 <- y$to[1] # upper bound of the current region
for (i in 2 : nrow(x)) { # must guarantee there are >= 2 rows in the table
z <- x[i, ]
lb2 <- z$from[1] # lower bound of the new region
ub2 <- z$to[1]
# There are only two behavious: either merge two regions or adding a separate region.
if (lb2 <= (ub1 + 1)) { # two regions overlap or adjacent: merge them into a single one. lb1 <= lb2 because the data frame is sorted in an ascending order.
if (ub2 > ub1) { # extend the previous region
y$to[j] <- ub2
ub1 <- ub2
} # else, do nothing as the second range is a subset of the first one
} else { # push a new and non-overlapping region into the stack of regions
y <- rbind(y, z)
j <- j + 1 # move the point to the new row
ub1 <- ub2
}
}
return(y)
}
findComplementaryRegions <- function(x, L) { # L: genome size
n <- nrow(x) # number of predefined regions
r <- x[1, ]
lb <- r$from[1]
ub <- r$to[1]
# initialise z
if (lb > 1) { # if the first region is not at the start of the genome
z <- data.frame(from = 1, to = lb - 1) # lb - 1 may equal s
} else {
z <- data.frame(from = integer(0), to = integer(0))
}
s <- ub + 1
for (i in 2 : n) { # must guarantee there are >= 2 rows in the table
r <- x[i, ]
lb <- r$from[1]
ub <- r$to[1]
# Notice the function mergeRegions guarantees that lb > s and lb - s >= 1.
# So the width of any gaps >= 1.
z <- rbind(z, data.frame(from = s, to = lb - 1)) # Notice lb - 1 may equal s
s <- ub + 1
}
# Are there any bases left beyond the last predefined region?
if (s <= L) {
z <- rbind(z, data.frame(from = s, to = L))
}
return(z)
}
# Read arguments
args <- commandArgs(trailingOnly = TRUE)
input <- args[1]
genome.len <- args[2]
prefix <- ifelse(length(args) >= 3, args[3], "coords")
if (file.exists(input)) {
x <- read.csv(input, header = FALSE)
} else {
stop(paste("The input file", input, "is not found.", sep = " "))
}
names(x) <- c("from", "to")
# Sort beginnings of regions so that their heads do not go backwards
x <- x[order(x$from, decreasing = FALSE), ]
# Merge regions
y <- mergeRegions(x)
# Find out complementary regions
z <- findComplementaryRegions(y, genome.len)
# write results
write.table(y, file = paste0(prefix, "__merged.csv"), col.names = FALSE, row.names = FALSE, quote = FALSE, sep = ",") # Merged regions
write.table(z, file = paste0(prefix, "__comple.csv"), col.names = FALSE, row.names = FALSE, quote = FALSE, sep = ",") # Complementary regions
# sumamrise regions
y$base <- y$to - y$from + 1
z$base <- z$to - z$from + 1
print(paste(nrow(x), "regions have been merged into", nrow(y), "regions of", sum(y$base), "bases.", sep = " "))
print(paste("There are", nrow(z), "regions of", sum(z$base), "bases outside of the merged regions.", sep = " "))