Riboviz_work/Test_mouse_genome.Rmd

---
title: "Test_mouse_genome"
author: "Isabel Birds"
date: "12/3/2020"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(Biostrings)
library(seqinr)
library(GenomicRanges) 
library(GenomicFeatures)
library(rtracklayer)
library(bedtoolsr)
library(dplyr)
options(bedtools.path = "/Users/cm13ijb/opt/anaconda3/bin")
set.seed(123)
```

Test of script on mouse genome.

Script requires - genome fasta file and GFF file. Length of flanking region - using approx median UTR length.


```{r user_inputs}
annotation_path <- "1_Raw_data/Mouse_genomes/gencode.vM25.annotation.gff3"
genome_path <- "1_Raw_data/Mouse_genomes/GRCm38.primary_assembly.genome.fa"
flanking_length_nt <- 250
output_path <- "2_Processed_data/Mouse_genomes/Mouse"
```

Using make TxDbFromGFF as better for quickly loading large GFFs.
If also need exons: exons <- exonsBy(txdb,"tx")

```{r load_files} 

#import a GFF as a TxDb object.
txdb <- makeTxDbFromGFF(annotation_path, format="gff3")

#Extract the coding regions, UTRs, and exons by transcript
cds <- cdsBy(txdb, "tx",use.names=TRUE)
threeUTR <-  threeUTRsByTranscript(txdb,use.names=TRUE)
fiveUTR <- fiveUTRsByTranscript(txdb,use.names=TRUE)

```

Describe UTRs for CDS without them.
This takes a while if no UTRs annotated.

```{r make_buffers}

#5' UTR
#Find CDS w/o 5' UTR
no_fiveUTR_cds <- cds[!names(cds) %in% names(fiveUTR)]

#Add 5'UTR
add_5UTR <- function(no_fiveUTR_cds,flanking_length_nt){
  #pull out start of cds
  #using to ensure retains all metadata and maps back to transcript.
  tmp_5UTR <- no_fiveUTR_cds[1]
  #find range for 5UTR,update range of tmp5_UTR
  ranges(tmp_5UTR) <- ranges(flank(tmp_5UTR,start = TRUE, both = FALSE, width = flanking_length_nt, use.names = TRUE))
  return(tmp_5UTR)
}

no_fiveUTR_UTR <- endoapply(no_fiveUTR_cds,add_5UTR,flanking_length_nt)

#3' UTR
#Find CDS w/o 3' UTR
no_threeUTR_cds <- cds[!names(cds) %in% names(threeUTR)]

#Add 3'UTR
add_3UTR <- function(no_threeUTR_cds,flanking_length_nt){
  #pull out end of cds
  #using to ensure retains all metadata and maps back to transcript.
  tmp_3UTR <- no_threeUTR_cds[length(no_threeUTR_cds)]
  #find range for 3UTR, update range of tmp3UTR
  ranges(tmp_3UTR) <- ranges(flank(tmp_3UTR,start = FALSE, both = FALSE, width = flanking_length_nt, use.names = TRUE))
  return(tmp_3UTR)
}

no_threeUTR_UTR <- endoapply(no_threeUTR_cds,add_3UTR,flanking_length_nt)

```

Extend UTRs of less than flanking length (250nt).
This bit takes a while... should depend on transcriptome size.

```{r extend_buffers}

#Find 5' UTR shorter than flanking length
short_5UTR <- fiveUTR[sum(width(fiveUTR)) < flanking_length_nt]

#function to extend 5'UTR
#ignores intron/exon boundaries currently
resize_short_5UTR <- function(short_5UTR,flanking_length_nt){
    #calculate extension needed
    extra = flanking_length_nt - sum(width(short_5UTR))
    #create extra range at start, combine with first range, update.
    #adds to start of first range for +, adds to end of first range for - strand
    ranges(short_5UTR[1]) <- ranges(range(flank(short_5UTR[1],start=TRUE,both=FALSE,width=extra),short_5UTR[1]))
    return(short_5UTR)
}

extended_short_5UTR <- endoapply(short_5UTR,resize_short_5UTR,flanking_length_nt)


#Find 3' UTR shorter than flanking length
short_3UTR <- threeUTR[sum(width(threeUTR)) < flanking_length_nt]

#function to extend 3'UTR
#ignores intron/exon boundaries currently
resize_short_3UTR <- function(short_3UTR,flanking_length_nt){
    #calculate extension needed
    extra = flanking_length_nt - sum(width(short_3UTR))
    #create extra range at end, combine with last range, update.
    #adds to end of last range for +, adds to start of last range for - strand
    ranges(short_3UTR[length(short_3UTR)]) <- ranges(range(flank(short_3UTR[length(short_3UTR)],
                                                                 start=FALSE,both=FALSE,width=extra),short_3UTR[length(short_3UTR)]))
    return(short_3UTR)
}

extended_short_3UTR <- endoapply(short_3UTR,resize_short_3UTR,flanking_length_nt)
```

Combine UTR list, check 5', 3' and CDS are same length. 
Combine all into one grangeslist.

```{r combine_UTR}

#Remove extended UTR from original three UTR - prevent duplicates.
threeUTR <- threeUTR[!names(threeUTR) %in% names(extended_short_3UTR)]
#Combine into one three UTR list
all_threeUTR <- c(threeUTR,extended_short_3UTR,no_threeUTR_UTR)

#Remove extended UTR from original five UTR - prevent duplicates.
fiveUTR <- fiveUTR[!names(fiveUTR) %in% names(extended_short_5UTR)]
#Combine into one five UTR list
all_fiveUTR <- c(fiveUTR,extended_short_5UTR,no_fiveUTR_UTR)

#CHECK - should have a 3UTR and 5UTR for all CDS, and all flanking length or longer
length(cds) == length(all_threeUTR)
length(cds) == length(all_fiveUTR)
all((sum(width(all_threeUTR)) >= flanking_length_nt))
all((sum(width(all_fiveUTR)) >= flanking_length_nt))

#Clear up env
rm(extended_short_3UTR,threeUTR,extended_short_5UTR,fiveUTR,no_fiveUTR_cds,no_fiveUTR_UTR,
   no_threeUTR_cds,no_threeUTR_UTR,short_3UTR,short_5UTR)
```

```{r get_sequences}

#Get rid of characters that will break R - EOF within quoted string error
names(cds) <- gsub("'","_",names(cds))
names(all_fiveUTR) <- gsub("'","_",names(all_fiveUTR))
names(all_threeUTR) <- gsub("'","_",names(all_threeUTR))

#Save cds and UTR as tmp bed files
export.bed(cds,"tmp_cds.bed")
export.bed(all_fiveUTR,"tmp_5UTR.bed")
export.bed(all_threeUTR,"tmp_3UTR.bed")

#Use bedtools to get sequences
#Split - splices together cds
cds_seq <- bedtoolsr::bt.getfasta(fi=genome_path, bed = "tmp_cds.bed",name = TRUE, split = FALSE)
three_seq <- bedtoolsr::bt.getfasta(fi=genome_path,bed = "tmp_3UTR.bed",name =TRUE, split = FALSE)
five_seq <- bedtoolsr::bt.getfasta(fi=genome_path,bed = "tmp_5UTR.bed",name =TRUE, split = FALSE)

#Check have expected no of sequences
length(cds_seq$V1)/2 == length(cds)
length(five_seq$V1)/2 == length(cds)
length(three_seq$V1)/2 == length(cds)

#Tidy IDs, and add quote marks back.
cds_seq$V1 <-  gsub(">","",cds_seq$V1)
cds_seq$V1 <-  gsub("_","'",cds_seq$V1)

five_seq$V1 <-  gsub(">","",five_seq$V1)
five_seq$V1 <-  gsub("_","'",five_seq$V1)

three_seq$V1 <-  gsub(">","",three_seq$V1)
three_seq$V1 <-  gsub("_","'",three_seq$V1)

```

Make new GFF based on fastas.
Three lines in GFF
YAL068C	rtracklayer	UTR5	1	250	.	+	.	Name=YAL068C
YAL068C	rtracklayer	CDS	251	613	.	+	.	Name=YAL068C
YAL068C	rtracklayer	UTR3	614	863	.	+	.	Name=YAL068C

```{r three_prime_UTR}
#Fastas are not read in same order
#Build dataframes for final GFF.
#five UTR
tmp_five_gff <- cbind.data.frame(five_seq$V1[seq(1, nrow(five_seq), 2)],
                              five_seq$V1[seq(2, nrow(five_seq), 2)])
colnames(tmp_five_gff) <- c("fasta_id","five_UTR")
tmp_five_gff$type <- "five_prime_UTR"
tmp_five_gff$start <- 1
tmp_five_gff$width <- width(as.character(tmp_five_gff$five_UTR))
tmp_five_gff$end <- width(as.character(tmp_five_gff$five_UTR))

#CDS
tmp_cds_gff <- cbind.data.frame(cds_seq$V1[seq(1, nrow(cds_seq), 2)],
                              cds_seq$V1[seq(2, nrow(cds_seq), 2)])
colnames(tmp_cds_gff) <- c("fasta_id","cds")
tmp_cds_gff$type <- "CDS"
tmp_cds_gff <- left_join(tmp_cds_gff,tmp_five_gff %>% select(fasta_id,end),
                         by = "fasta_id")
tmp_cds_gff$start <- tmp_cds_gff$end + 1
tmp_cds_gff$width <- width(as.character(tmp_cds_gff$cds))
tmp_cds_gff$end <- tmp_cds_gff$end + width(as.character(tmp_cds_gff$cds))  

#three UTR
tmp_three_gff <- cbind.data.frame(three_seq$V1[seq(1, nrow(three_seq), 2)],
                              three_seq$V1[seq(2, nrow(three_seq), 2)])
colnames(tmp_three_gff) <- c("fasta_id","three_UTR")
tmp_three_gff$type <- "three_prime_UTR"
tmp_three_gff <- left_join(tmp_three_gff,tmp_cds_gff %>% select(fasta_id,end),
                         by = "fasta_id")
tmp_three_gff$start <- tmp_three_gff$end + 1
tmp_three_gff$width <- width(as.character(tmp_three_gff$three_UTR))
tmp_three_gff$end <- tmp_three_gff$end + width(as.character(tmp_three_gff$three_UTR))


#make gffs
five_gff <- GRanges(seqnames = tmp_five_gff$fasta_id,
                         ranges = IRanges(start = tmp_five_gff$start,
                                          width = tmp_five_gff$width),
                         strand = "+",
                         type = tmp_five_gff$type,
                         Name = tmp_five_gff$fasta_id)

cds_gff <- GRanges(seqnames = tmp_cds_gff$fasta_id,
                         ranges = IRanges(start = tmp_cds_gff$start,
                                          width = tmp_cds_gff$width),
                         strand = "+",
                         type = tmp_cds_gff$type,
                         Name = tmp_cds_gff$fasta_id)

three_gff <- GRanges(seqnames = tmp_three_gff$fasta_id,
                         ranges = IRanges(start = tmp_three_gff$start,
                                          width = tmp_three_gff$width),
                         strand = "+",
                         type = tmp_three_gff$type,
                         Name = tmp_three_gff$fasta_id)

#Combine into final gff
final_gff <- c(five_gff,cds_gff,three_gff)

export.gff3(final_gff,con = paste(output_path,".gff3",sep=""))

```

One sequence per CDS eg 
>YAL068C
ACCTATGAAAGATTTATGATTCGTTCAGAAACAAGAGCATCTCCATAGAGATAATGAGATTGTGTGAAAGATGAGATATA

```{r make final fasta}

final_fasta <- select(tmp_five_gff,c("fasta_id","five_UTR"))
final_fasta <- left_join(final_fasta,tmp_cds_gff %>% select(fasta_id,cds),
                         by = "fasta_id")
final_fasta <- left_join(final_fasta,tmp_three_gff %>% select(fasta_id,three_UTR),
                         by = "fasta_id")
final_fasta$full_seq <- paste(final_fasta$five_UTR,final_fasta$cds,final_fasta$three_UTR,sep = "") 

#using function not mapply to ensure makes a new file

for (i in (1:length(final_fasta$fasta_id))){
  if (i == 1){
    write.fasta(sequences = final_fasta$full_seq[i], 
                names = final_fasta$fasta_id[i],
                file.out = paste(output_path,".fasta",sep=""),
                as.string = TRUE,
                open = "w")
  } else{
    write.fasta(sequences = final_fasta$full_seq[i], 
                names = final_fasta$fasta_id[i],
                file.out = paste(output_path,".fasta",sep=""),
                as.string = TRUE,
                open = "a")
  }
}


```

Then remove tmp files!