-
Notifications
You must be signed in to change notification settings - Fork 0
/
sRNA.Mapper.range_1830nt.sh
116 lines (68 loc) · 4.2 KB
/
sRNA.Mapper.range_1830nt.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/bin/bash
#Juan Santos, 2022 November
##################################################################################
########### Reference files to be used and working dir ###################
##################################################################################
#genomic assembly in fasta format
genome_assembly_fasta="/home/jsantos/genome_ref/TAIR10/bowtie1/TAIR10_chr_all.fasta"
#genomic chromosome sizes txt file
genome_chr_sizes="/home/jsantos/genome_ref/TAIR10/TAIR10.chrom.sizes"
#bowtie1 indexes to be used
genome_non_sRNA_bw_idx="/home/jsantos/genome_ref/TAIR10/sRNA_filter/non_sRNA"
genome_bw_idx="/home/jsantos/genome_ref/TAIR10/bowtie1/TAIR10"
#annotation files in bed format
genes_annot_bed="/home/jsantos/genome_ref/TAIR10/bed_landscape/TAIR10_genes_isoform1_annot.bed"
tes_annot_bed="/home/jsantos/genome_ref/TAIR10/bed_landscape/TAIR_transposable_elements_6col.bed"
##################################################################################
#working directory where all the sample directories are allocated
working_dir="/media/diskb/nicolas_tmp/REPLICATES_TOTAL"
#this defines the samples to be analysed (dir names)
sample_list="mutant_rep1 mutant_rep2 wt_rep1 wt_rep2";
#sample_list="test_rep1";
##################################################################################
### Step 1 . Selecting sRNA size_1830 range and mapping
for sample in $sample_list ; do
cd $working_dir/${sample};
mkdir size_1830; cd size_1830;
#this cuts without adapter in desired size
cutadapt --no-trim -m 18 -M 30 -o sample.size_sel.fastq ../sample.perfect.fastq > summary_NOtrimming.txt;
#Mapping with protocol for handling multi-mapped reads is f (fractional-seeded guide)
ShortStack --align_only --bowtie_cores 20 --mismatches 0 --nohp --mmap f --outdir shortstack --genome $genome_assembly_fasta --readfile sample.size_sel.fastq;
done
wait
### Step 2 . Calculating coverage and normalizing it to RPM.
for sample in $sample_list ; do
cd $working_dir/${sample}/size_1830/shortstack;
samtools sort -@ 20 sample.size_sel.bam -o sample.size_sel.sorted.bam;
samtools index sample.size_sel.sorted.bam;
#transforms bam to bed
bamToBed -i sample.size_sel.sorted.bam > sample.size_sel.sorted.bed;
#calculation normalization factor, reads per million
librarySize=$(wc -l ../../sample.perfect.txt | awk '{total+=$1}END{print total}');
b=1000000;
factor=`echo "$b / $librarySize "|bc -l`;
#Creates a normalised coverage file in bed format (4 columns bed)
bedtools genomecov -i sample.size_sel.sorted.bed -g $genome_chr_sizes -bga -scale $factor | grep -v "mitochondria\|chloroplast" > sample.coverage.bed;
done
wait
### Step 3 . Calculating diverse final output; bedGraph, RPM values on genomic features and more
for sample in $sample_list ; do
cd $working_dir/${sample}/size_1830/shortstack;
#Creates a bedGraph file with normalized values
bedtools genomecov -ibam sample.size_sel.sorted.bam -bg -scale $factor > sample.bedGraph&
#Creates a coverage file for differential expression of genes with read counts (not normalised) per gene
bedtools multicov -D -bams sample.size_sel.sorted.bam -bed $genes_annot_bed> gene.counts.txt&
#Creates a coverage file for differential expression of TEs with read counts (not normalised) per TE
bedtools multicov -D -bams sample.size_sel.sorted.bam -bed $tes_annot_bed> te.counts.txt&
#Calculates genomic RPM values for genes
bedtools map -a $genes_annot_bed -b sample.coverage.bed -c 4 -o mean>genomic_RPM_values.genes.txt&
#Calculates genomic RPM values for tes
bedtools map -a $tes_annot_bed -b sample.coverage.bed -c 4 -o mean>genomic_RPM_values.tes.txt&
#making list of reads id with sequences (two columns, id after sample.size_sel.fastq)
samtools view sample.size_sel.sorted.bam| cut -f 1,10 >srna_ids.txt&
#extracting reads mapping into genes (column 4 is read id after sample.size_sel.fastq)
intersectBed -abam sample.size_sel.sorted.bam -b $genes_annot_bed -wa -wb -bed|cut -f 1,2,3,4,13,14,15,16> gene.reads.txt&
#extracting reads mapping into TEs (column 4 is read id after sample.size_sel.fastq)
intersectBed -abam sample.size_sel.sorted.bam -b $tes_annot_bed -wa -wb -bed|cut -f 1,2,3,4,13,14,15,16> te.reads.txt&
done
wait