-
Notifications
You must be signed in to change notification settings - Fork 2
/
create_reference_junctions.R
196 lines (188 loc) · 7.66 KB
/
create_reference_junctions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#' Load short read sjb data
#'
#' @param sjbPath
#' @importFrom dplyr %>%
#' @importFrom dplyr mutate
#' @importFrom dplyr filter
#' @importFrom rlang .data
#' @return
#' @export
#'
#' @examples
read_sjout <- function(sjbPath) {
junctionFile <- read.table(sjbPath)
colnames(junctionFile) <-
c(
"seqnames",
"start",
"end",
"strand",
"intronMotif",
"comp.ref",
"read.count",
"multimappers",
"overhang"
)
junctionFile <- junctionFile %>%
dplyr::mutate(
juncID = paste0(.data$seqnames, ":", .data$start, "-", .data$end),
strand = dplyr::case_when(
.data$strand == 0 ~ "*",
.data$strand == 1 ~ "+",
TRUE ~ "-"
)
)
return(junctionFile)
}
#' Extract junctions from annotation
#' Create junctions from annotation
#' @param refAnnot: path to refrence annotation
#' @importFrom rlang .data
#' @return
#' @export
#'
#' @examples
make_annotation_txf <- function(refAnnot){
txdb <- GenomicFeatures::makeTxDbFromGRanges(refAnnot)
annot_junctions <- SGSeq::convertToTxFeatures(txdb)
annot_junctions <- annot_junctions[SGSeq::type(annot_junctions) == "J"]
GenomicRanges::start(annot_junctions) <- GenomicRanges::start(annot_junctions) + 1
GenomicRanges::end(annot_junctions) <- GenomicRanges::end(annot_junctions) - 1
return(annot_junctions)
}
#' Annotate junctions
#'
#' @param AnnotJunctionsL: junction object from make_annotation_txf
#' @param linksDatabase: database to identify TSS and TES to filter exons
#' @param genesMultipleJunctions: object of genes with multiple isoforms
#' @importFrom rlang .data
#' @return
#' @export
#'
#' @examples
get_junctions_from_annot <- function(AnnotJunctions,linksDatabase,genesMultipleJunctions){
# overlap junctions to windows to filter
message("Removing junctions overlapping TSS and 3'end windows")
junctionsInPromoters <- GenomicRanges::findOverlaps(AnnotJunctions, linksDatabase$promoterDependentWindow)
juncNotIn5prime <- AnnotJunctions[-S4Vectors::queryHits(junctionsInPromoters)]
junctionsIn3ends <- GenomicRanges::findOverlaps(AnnotJunctions, linksDatabase$tesDependentWindow)
juncNotIn3prime <- AnnotJunctions[-S4Vectors::queryHits(junctionsIn3ends)]
junctionsToWork <- unique( c(juncNotIn5prime,juncNotIn3prime))
# keep only genes with more than one junction combinations
message("Removing genes with more")
# remove single junction genes
message("Removing single CDS isoform genes")
junctionsToWork <- junctionsToWork %>%
as.data.frame(.) %>% tidyr::unnest(., geneName) %>%
dplyr::filter(geneName %in% genesMultipleJunctions$geneName) %>%
GenomicRanges::makeGRangesFromDataFrame(., keep.extra.columns = TRUE)
junctionsToWork <- unique(junctionsToWork)
junctionsToWork$txName <- NULL
junctionsToWork$juncID <-
paste0( GenomicRanges::seqnames(junctionsToWork),
":",
GenomicRanges::start(junctionsToWork),
"-",
GenomicRanges::end(junctionsToWork))
GenomicRanges::mcols(junctionsToWork) <- GenomicRanges::mcols(junctionsToWork)[c(2,3)]
junctionRef <- junctionsToWork
junctionRef$gene_id <- junctionRef$geneName
junctionRef$geneName <- NULL
message("Reference junctions ready")
return(junctionRef)
}
#' Retreive junctions from short read sequencing
#'
#' @param pathSJ.out : path to STAR short read junctions file
#' @param min.counts: minimal counts to filter junctions
#' @param refAnnot: reference annotation in genomicRanges format
#' @param linksDatabase: 5'/3' reference annotation
#' @param genesMultipleJunctions: object with genes identified with their junctions
#'
#' @return
#' @export
#'
#' @examples
get_junctions_from_short_reads <- function(pathSJ.out, min.counts,refAnnot,linksDatabase,genesMultipleJunctions){
message("Load splice junction files")
junctionCounts <- read_sjout(pathSJ.out)
junctionCounts <- junctionCounts %>%
dplyr::filter(.data$read.count > min.counts) %>%
dplyr::select(1, 2, 3, 4, 7, 10) %>%
GenomicRanges::makeGRangesFromDataFrame(., keep.extra.columns = TRUE)
# overlap junctions to windows to filter
junctionsInPromoters <- GenomicRanges::findOverlaps(junctionCounts, linksDatabase$promoterDependentWindow)
juncNotIn5prime <- junctionCounts[-S4Vectors::queryHits(junctionsInPromoters)]
junctionsIn3ends <- GenomicRanges::findOverlaps(junctionCounts, linksDatabase$tesDependentWindow)
juncNotIn3prime <- junctionCounts[-S4Vectors::queryHits(junctionsIn3ends)]
shortReadJunctions <- unique( c(juncNotIn5prime,juncNotIn3prime))
# annotate junctions to genes
geneBins <- unique(unlist( range( split(refAnnot, f=refAnnot$gene_id) )))
hits2Genes <- GenomicRanges::findOverlaps( shortReadJunctions, geneBins, type = "within")
shortReadJunctions <- shortReadJunctions[S4Vectors::queryHits(hits2Genes),]
shortReadJunctions$gene_id <- names(geneBins[S4Vectors::subjectHits(hits2Genes),])
GenomicRanges::mcols(shortReadJunctions)<- GenomicRanges::mcols(shortReadJunctions)[c("gene_id", "juncID")]
shortReadJunctions <- unique(shortReadJunctions)
# keep only genes with more than one junction combinations
message("single isoform genes")
shortReadJunctions <- shortReadJunctions[shortReadJunctions$gene_id %in% genesMultipleJunctions$geneName,]
return(shortReadJunctions)
}
#' Title
#'
#' @param pathSJ.out: object of junctions from STAR
#' @param min.jcounts : minimal counts for junction filtering
#' @param refAnnot : reference annotation genomic ranges object
#' @param type: specicify type if want to use short read correction.
#'
#' @return
#' @export
#'
#' @examples
create_reference_junctions <- function(pathSJ.out, min.jcounts, refAnnot, type){
message("Extracting junctions from annotation")
AnnotJunctions <- make_annotation_txf(refAnnot)
message("Preparing Annotation of 5'and 3' ends")
linksDatabase <- prepareLinksDatabase(refAnnot,
tss.window = 50,
tes.window = 150)
# window of exons dependent on ATSS usage
linksDatabase$promoterDependentWindow <-
unlist( range(
split(
linksDatabase$TSSCoordinate.base,
f = linksDatabase$TSSCoordinate.base$value.group_name
)
))
# window of exons dependent on alternative 3'end usage
linksDatabase$tesDependentWindow <-
unlist( range(
split(
linksDatabase$TESCoordinate.base,
f = linksDatabase$TESCoordinate.base$value.group_name
)
))
# get genes with more 1 different junction
genesMultipleJunctions <- AnnotJunctions %>% as.data.frame(.) %>%
tidyr::unnest(., txName) %>% tidyr::unnest(., geneName) %>%
mutate(JunID = paste0(seqnames, start, end)) %>%
dplyr::group_by(txName) %>%
dplyr::mutate(junName = paste0(JunID, collapse = "|")) %>%
dplyr::distinct(junName, .keep_all = TRUE) %>%
dplyr::group_by(geneName) %>% dplyr::tally() %>% dplyr::filter(n>1)
if(type=="short"){
message("Extracting junctions from short reads")
shortReadJunctions <- get_junctions_from_short_reads(pathSJ.out,
min.counts=min.jcounts,
refAnnot,
linksDatabase,
genesMultipleJunctions)
message("Extracting junctions from reference annotation")
annotationJunctions <- get_junctions_from_annot(AnnotJunctions, linksDatabase,genesMultipleJunctions)
reference_junctions <- c(shortReadJunctions, annotationJunctions)
return(reference_junctions)
}else{
annotation_junctions <- get_junctions_from_annot(AnnotJunctions, linksDatabase, genesMultipleJunctions)
return(annotation_junctions)
}
}