forked from IsabelBirds/Drosophila_Riboviz_work
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Test_mouse_genome.Rmd
292 lines (230 loc) · 10.1 KB
/
Test_mouse_genome.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
---
title: "Test_mouse_genome"
author: "Isabel Birds"
date: "12/3/2020"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(Biostrings)
library(seqinr)
library(GenomicRanges)
library(GenomicFeatures)
library(rtracklayer)
library(bedtoolsr)
library(dplyr)
options(bedtools.path = "/Users/cm13ijb/opt/anaconda3/bin")
set.seed(123)
```
Test of script on mouse genome.
Script requires - genome fasta file and GFF file. Length of flanking region - using approx median UTR length.
```{r user_inputs}
annotation_path <- "1_Raw_data/Mouse_genomes/gencode.vM25.annotation.gff3"
genome_path <- "1_Raw_data/Mouse_genomes/GRCm38.primary_assembly.genome.fa"
flanking_length_nt <- 250
output_path <- "2_Processed_data/Mouse_genomes/Mouse"
```
Using make TxDbFromGFF as better for quickly loading large GFFs.
If also need exons: exons <- exonsBy(txdb,"tx")
```{r load_files}
#import a GFF as a TxDb object.
txdb <- makeTxDbFromGFF(annotation_path, format="gff3")
#Extract the coding regions, UTRs, and exons by transcript
cds <- cdsBy(txdb, "tx",use.names=TRUE)
threeUTR <- threeUTRsByTranscript(txdb,use.names=TRUE)
fiveUTR <- fiveUTRsByTranscript(txdb,use.names=TRUE)
```
Describe UTRs for CDS without them.
This takes a while if no UTRs annotated.
```{r make_buffers}
#5' UTR
#Find CDS w/o 5' UTR
no_fiveUTR_cds <- cds[!names(cds) %in% names(fiveUTR)]
#Add 5'UTR
add_5UTR <- function(no_fiveUTR_cds,flanking_length_nt){
#pull out start of cds
#using to ensure retains all metadata and maps back to transcript.
tmp_5UTR <- no_fiveUTR_cds[1]
#find range for 5UTR,update range of tmp5_UTR
ranges(tmp_5UTR) <- ranges(flank(tmp_5UTR,start = TRUE, both = FALSE, width = flanking_length_nt, use.names = TRUE))
return(tmp_5UTR)
}
no_fiveUTR_UTR <- endoapply(no_fiveUTR_cds,add_5UTR,flanking_length_nt)
#3' UTR
#Find CDS w/o 3' UTR
no_threeUTR_cds <- cds[!names(cds) %in% names(threeUTR)]
#Add 3'UTR
add_3UTR <- function(no_threeUTR_cds,flanking_length_nt){
#pull out end of cds
#using to ensure retains all metadata and maps back to transcript.
tmp_3UTR <- no_threeUTR_cds[length(no_threeUTR_cds)]
#find range for 3UTR, update range of tmp3UTR
ranges(tmp_3UTR) <- ranges(flank(tmp_3UTR,start = FALSE, both = FALSE, width = flanking_length_nt, use.names = TRUE))
return(tmp_3UTR)
}
no_threeUTR_UTR <- endoapply(no_threeUTR_cds,add_3UTR,flanking_length_nt)
```
Extend UTRs of less than flanking length (250nt).
This bit takes a while... should depend on transcriptome size.
```{r extend_buffers}
#Find 5' UTR shorter than flanking length
short_5UTR <- fiveUTR[sum(width(fiveUTR)) < flanking_length_nt]
#function to extend 5'UTR
#ignores intron/exon boundaries currently
resize_short_5UTR <- function(short_5UTR,flanking_length_nt){
#calculate extension needed
extra = flanking_length_nt - sum(width(short_5UTR))
#create extra range at start, combine with first range, update.
#adds to start of first range for +, adds to end of first range for - strand
ranges(short_5UTR[1]) <- ranges(range(flank(short_5UTR[1],start=TRUE,both=FALSE,width=extra),short_5UTR[1]))
return(short_5UTR)
}
extended_short_5UTR <- endoapply(short_5UTR,resize_short_5UTR,flanking_length_nt)
#Find 3' UTR shorter than flanking length
short_3UTR <- threeUTR[sum(width(threeUTR)) < flanking_length_nt]
#function to extend 3'UTR
#ignores intron/exon boundaries currently
resize_short_3UTR <- function(short_3UTR,flanking_length_nt){
#calculate extension needed
extra = flanking_length_nt - sum(width(short_3UTR))
#create extra range at end, combine with last range, update.
#adds to end of last range for +, adds to start of last range for - strand
ranges(short_3UTR[length(short_3UTR)]) <- ranges(range(flank(short_3UTR[length(short_3UTR)],
start=FALSE,both=FALSE,width=extra),short_3UTR[length(short_3UTR)]))
return(short_3UTR)
}
extended_short_3UTR <- endoapply(short_3UTR,resize_short_3UTR,flanking_length_nt)
```
Combine UTR list, check 5', 3' and CDS are same length.
Combine all into one grangeslist.
```{r combine_UTR}
#Remove extended UTR from original three UTR - prevent duplicates.
threeUTR <- threeUTR[!names(threeUTR) %in% names(extended_short_3UTR)]
#Combine into one three UTR list
all_threeUTR <- c(threeUTR,extended_short_3UTR,no_threeUTR_UTR)
#Remove extended UTR from original five UTR - prevent duplicates.
fiveUTR <- fiveUTR[!names(fiveUTR) %in% names(extended_short_5UTR)]
#Combine into one five UTR list
all_fiveUTR <- c(fiveUTR,extended_short_5UTR,no_fiveUTR_UTR)
#CHECK - should have a 3UTR and 5UTR for all CDS, and all flanking length or longer
length(cds) == length(all_threeUTR)
length(cds) == length(all_fiveUTR)
all((sum(width(all_threeUTR)) >= flanking_length_nt))
all((sum(width(all_fiveUTR)) >= flanking_length_nt))
#Clear up env
rm(extended_short_3UTR,threeUTR,extended_short_5UTR,fiveUTR,no_fiveUTR_cds,no_fiveUTR_UTR,
no_threeUTR_cds,no_threeUTR_UTR,short_3UTR,short_5UTR)
```
```{r get_sequences}
#Get rid of characters that will break R - EOF within quoted string error
names(cds) <- gsub("'","_",names(cds))
names(all_fiveUTR) <- gsub("'","_",names(all_fiveUTR))
names(all_threeUTR) <- gsub("'","_",names(all_threeUTR))
#Save cds and UTR as tmp bed files
export.bed(cds,"tmp_cds.bed")
export.bed(all_fiveUTR,"tmp_5UTR.bed")
export.bed(all_threeUTR,"tmp_3UTR.bed")
#Use bedtools to get sequences
#Split - splices together cds
cds_seq <- bedtoolsr::bt.getfasta(fi=genome_path, bed = "tmp_cds.bed",name = TRUE, split = FALSE)
three_seq <- bedtoolsr::bt.getfasta(fi=genome_path,bed = "tmp_3UTR.bed",name =TRUE, split = FALSE)
five_seq <- bedtoolsr::bt.getfasta(fi=genome_path,bed = "tmp_5UTR.bed",name =TRUE, split = FALSE)
#Check have expected no of sequences
length(cds_seq$V1)/2 == length(cds)
length(five_seq$V1)/2 == length(cds)
length(three_seq$V1)/2 == length(cds)
#Tidy IDs, and add quote marks back.
cds_seq$V1 <- gsub(">","",cds_seq$V1)
cds_seq$V1 <- gsub("_","'",cds_seq$V1)
five_seq$V1 <- gsub(">","",five_seq$V1)
five_seq$V1 <- gsub("_","'",five_seq$V1)
three_seq$V1 <- gsub(">","",three_seq$V1)
three_seq$V1 <- gsub("_","'",three_seq$V1)
```
Make new GFF based on fastas.
Three lines in GFF
YAL068C rtracklayer UTR5 1 250 . + . Name=YAL068C
YAL068C rtracklayer CDS 251 613 . + . Name=YAL068C
YAL068C rtracklayer UTR3 614 863 . + . Name=YAL068C
```{r three_prime_UTR}
#Fastas are not read in same order
#Build dataframes for final GFF.
#five UTR
tmp_five_gff <- cbind.data.frame(five_seq$V1[seq(1, nrow(five_seq), 2)],
five_seq$V1[seq(2, nrow(five_seq), 2)])
colnames(tmp_five_gff) <- c("fasta_id","five_UTR")
tmp_five_gff$type <- "five_prime_UTR"
tmp_five_gff$start <- 1
tmp_five_gff$width <- width(as.character(tmp_five_gff$five_UTR))
tmp_five_gff$end <- width(as.character(tmp_five_gff$five_UTR))
#CDS
tmp_cds_gff <- cbind.data.frame(cds_seq$V1[seq(1, nrow(cds_seq), 2)],
cds_seq$V1[seq(2, nrow(cds_seq), 2)])
colnames(tmp_cds_gff) <- c("fasta_id","cds")
tmp_cds_gff$type <- "CDS"
tmp_cds_gff <- left_join(tmp_cds_gff,tmp_five_gff %>% select(fasta_id,end),
by = "fasta_id")
tmp_cds_gff$start <- tmp_cds_gff$end + 1
tmp_cds_gff$width <- width(as.character(tmp_cds_gff$cds))
tmp_cds_gff$end <- tmp_cds_gff$end + width(as.character(tmp_cds_gff$cds))
#three UTR
tmp_three_gff <- cbind.data.frame(three_seq$V1[seq(1, nrow(three_seq), 2)],
three_seq$V1[seq(2, nrow(three_seq), 2)])
colnames(tmp_three_gff) <- c("fasta_id","three_UTR")
tmp_three_gff$type <- "three_prime_UTR"
tmp_three_gff <- left_join(tmp_three_gff,tmp_cds_gff %>% select(fasta_id,end),
by = "fasta_id")
tmp_three_gff$start <- tmp_three_gff$end + 1
tmp_three_gff$width <- width(as.character(tmp_three_gff$three_UTR))
tmp_three_gff$end <- tmp_three_gff$end + width(as.character(tmp_three_gff$three_UTR))
#make gffs
five_gff <- GRanges(seqnames = tmp_five_gff$fasta_id,
ranges = IRanges(start = tmp_five_gff$start,
width = tmp_five_gff$width),
strand = "+",
type = tmp_five_gff$type,
Name = tmp_five_gff$fasta_id)
cds_gff <- GRanges(seqnames = tmp_cds_gff$fasta_id,
ranges = IRanges(start = tmp_cds_gff$start,
width = tmp_cds_gff$width),
strand = "+",
type = tmp_cds_gff$type,
Name = tmp_cds_gff$fasta_id)
three_gff <- GRanges(seqnames = tmp_three_gff$fasta_id,
ranges = IRanges(start = tmp_three_gff$start,
width = tmp_three_gff$width),
strand = "+",
type = tmp_three_gff$type,
Name = tmp_three_gff$fasta_id)
#Combine into final gff
final_gff <- c(five_gff,cds_gff,three_gff)
export.gff3(final_gff,con = paste(output_path,".gff3",sep=""))
```
One sequence per CDS eg
>YAL068C
ACCTATGAAAGATTTATGATTCGTTCAGAAACAAGAGCATCTCCATAGAGATAATGAGATTGTGTGAAAGATGAGATATA
```{r make final fasta}
final_fasta <- select(tmp_five_gff,c("fasta_id","five_UTR"))
final_fasta <- left_join(final_fasta,tmp_cds_gff %>% select(fasta_id,cds),
by = "fasta_id")
final_fasta <- left_join(final_fasta,tmp_three_gff %>% select(fasta_id,three_UTR),
by = "fasta_id")
final_fasta$full_seq <- paste(final_fasta$five_UTR,final_fasta$cds,final_fasta$three_UTR,sep = "")
#using function not mapply to ensure makes a new file
for (i in (1:length(final_fasta$fasta_id))){
if (i == 1){
write.fasta(sequences = final_fasta$full_seq[i],
names = final_fasta$fasta_id[i],
file.out = paste(output_path,".fasta",sep=""),
as.string = TRUE,
open = "w")
} else{
write.fasta(sequences = final_fasta$full_seq[i],
names = final_fasta$fasta_id[i],
file.out = paste(output_path,".fasta",sep=""),
as.string = TRUE,
open = "a")
}
}
```
Then remove tmp files!