forked from tomas-fer/HybPhyloMaker
-
Notifications
You must be signed in to change notification settings - Fork 0
/
HybPhyloMaker3_generatepslx.sh
248 lines (227 loc) · 8.58 KB
/
HybPhyloMaker3_generatepslx.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#!/bin/bash
#----------------MetaCentrum----------------
#PBS -l walltime=1d
#PBS -l nodes=1:ppn=1
#PBS -j oe
#PBS -l mem=4gb
#PBS -l scratch=8gb
#PBS -N HybPhyloMaker3_generate_pslx
#PBS -m abe
#-------------------HYDRA-------------------
#$ -S /bin/bash
#$ -q sThC.q
#$ -l mres=1G
#$ -cwd
#$ -j y
#$ -N HybPhyloMaker3_generate_pslx
#$ -o HybPhyloMaker3_generate_pslx.log
# ********************************************************************************
# * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building *
# * Script 03 - Process consensus after mapping, make pslx files *
# * v.1.3.2 *
# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 *
# * tomas.fer@natur.cuni.cz *
# * based on Weitemier et al. (2014), Applications in Plant Science 2(9): 1400042*
# ********************************************************************************
# Input: Consensus sequences exported from Geneious; the file must be named consensus.fasta or consensus_cpDNA.fasta.
# It is a multiple-FASTA file with sequence names such as
# Camptandra-latifolia_S4-all-no-dups_assembled_to_Curcuma_exons_reference_400Ns__consensus_sequence,
# prepared in /storage/$server/home/$LOGNAME/data/30consensus/
#Complete path and set configuration for selected location
#Three run modes are distinguished: MetaCentrum (PBS_O_HOST ends with '.cz'),
#Hydra (HOSTNAME looks like 'compute-*-*.local') and a local run (everything else).
#Each mode sources settings.cfg, derives the data/source paths and enters its working directory.
#Entering the working directory is verified: all later steps create and delete files
#relative to the current directory, so continuing from the wrong place is not safe.
if [[ $PBS_O_HOST == *".cz" ]]; then
  echo -e "\nHybPhyloMaker3 is running on MetaCentrum..."
  #settings for MetaCentrum
  #Copy file with settings from home and set variables from settings.cfg
  cp -f "$PBS_O_WORKDIR/settings.cfg" .
  . settings.cfg
  . /packages/run/modules-2.0/init/bash
  path=/storage/$server/home/$LOGNAME/$data
  source=/storage/$server/home/$LOGNAME/HybSeqSource
  othersourcepath=/storage/$server/home/$LOGNAME/$othersource
  otherpslxpath=/storage/$server/home/$LOGNAME/$otherpslx
  #Move to scratch (an empty SCRATCHDIR is rejected explicitly: 'cd ""' would not fail)
  if [[ -z $SCRATCHDIR ]] || ! cd "$SCRATCHDIR"; then
    echo -e "Cannot enter scratch directory '$SCRATCHDIR'. Exiting...\n"
    exit 3
  fi
  #Add necessary modules
  module add blat-suite-34
elif [[ $HOSTNAME == compute-*-*.local ]]; then
  echo -e "\nHybPhyloMaker3 is running on Hydra..."
  #settings for Hydra
  #set variables from settings.cfg
  . settings.cfg
  path=../$data
  source=../HybSeqSource
  othersourcepath=../$othersource
  otherpslxpath=../$otherpslx
  #Make and enter work directory
  mkdir -p workdir03
  if ! cd workdir03; then
    echo -e "Cannot enter work directory 'workdir03'. Exiting...\n"
    exit 3
  fi
  #Add necessary modules
  module load bioinformatics/blat/36x1
else
  echo -e "\nHybPhyloMaker3 is running locally..."
  #settings for local run
  #set variables from settings.cfg
  . settings.cfg
  path=../$data
  source=../HybSeqSource
  othersourcepath=../$othersource
  otherpslxpath=../$otherpslx
  #Make and enter work directory
  mkdir -p workdir03
  if ! cd workdir03; then
    echo -e "Cannot enter work directory 'workdir03'. Exiting...\n"
    exit 3
  fi
fi
#Select the data type: 'cp' when the 'cp' setting contains "yes", 'exons' otherwise
#('type' is used below as a sub-folder name under $path)
case "$cp" in
  *yes*)
    type="cp"
    echo -e "Working with cpDNA\n"
    ;;
  *)
    type="exons"
    echo -e "Working with exons\n"
    ;;
esac
#Check that the input files exist
#The consensus file and the reference depend on the data type; the consensus is tested first,
#then the reference in HybSeqSource. On a missing file the work directory is removed
#(only succeeds when it is empty) and the script exits with status 3.
echo -ne "Testing if input data are available..."
if [[ $cp =~ "yes" ]]; then
  neededconsensus="$path/$type/30consensus/consensus_cpDNA.fasta"
  neededreference="$cpDNACDS"
else
  neededconsensus="$path/$type/30consensus/consensus.fasta"
  neededreference="$probes"
fi
if [ ! -f "$neededconsensus" ]; then
  echo -e "'$neededconsensus' is missing. Exiting...\n"
  rm -d ../workdir03/ 2>/dev/null
  exit 3
fi
if [ ! -f "$source/$neededreference" ]; then
  echo -e "'$neededreference' is missing in 'HybSeqSource'. Exiting...\n"
  rm -d ../workdir03/ 2>/dev/null
  exit 3
fi
echo -e "OK\n"
#Test if folders for results already exist
#Results of a previous run are never overwritten: stop if either output folder is present
#(40contigs is tested before 50pslx)
for resultdir in 40contigs 50pslx; do
  if [ -d "$path/$type/$resultdir" ]; then
    echo -e "Directory '$path/$type/$resultdir' already exists. Delete it or rename before running this script again. Exiting...\n"
    rm -d ../workdir03/ 2>/dev/null
    exit 3
  fi
done
#Unless 'location' is "1", the work directory must be empty as well
if [[ $location != "1" ]] && [ "$(ls -A ../workdir03)" ]; then
  echo -e "Directory 'workdir03' already exists and is not empty. Delete it or rename before running this script again. Exiting...\n"
  rm -d ../workdir03/ 2>/dev/null
  exit 3
fi
#Copy fasta from home folder to scratch/workdir
#(paths are quoted so that directories containing whitespace are handled)
cp -r "$path/$type/30consensus/"* .
#Make a new folder for results
mkdir -p "$path/$type"
#Stop if the results folder cannot be created: every later 'cp ... 40contigs'
#would otherwise write a plain file called '40contigs' instead of filling a folder
if ! mkdir "$path/$type/40contigs"; then
  echo -e "Cannot create directory '$path/$type/40contigs'. Exiting...\n"
  exit 3
fi
#-----------------------GENEIOUS CONSENSUS SEQUENCE MODIFICATION-----------------------
#Split a multiple fasta file into one fasta file per sequence (written to the current directory).
#  $1 - multiple fasta file to split
#  $2 - suffix appended to every header line (must not contain characters special to sed, i.e. '/', '&', '\')
#For every line leading '?'s are removed. Header lines are shortened to their first two
#'-'-separated fields and get the suffix appended; each record is then written to
#'<header without the leading ">">.fasta'.
#Only header lines are shortened: sequence lines may contain '-' and must not be truncated.
split_consensus() {
  sed -e 's/^?*//' \
      -e '/^>/s/^\([^-]*-[^-]*\)-.*$/\1/' \
      -e "/>/s/.*/&$2/" "$1" |
    awk '/^>/ {OUT=substr($0,2) ".fasta"}; OUT {print >OUT}'
}

echo -en "Parsing consensus sequence..."
#Select the consensus file to process, the suffix added to its headers,
#and the consensus file of the other data type (removed if it was copied along)
if [[ $cp =~ "yes" ]]; then
  consensusfile=consensus_cpDNA.fasta
  headersuffix=_consensus_cpDNAsequence
  unusedfile=consensus.fasta
else
  consensusfile=consensus.fasta
  headersuffix=_consensus_sequence
  unusedfile=consensus_cpDNA.fasta
fi
#Modify Windows EOLs to Unix EOLs (i.e., LF only)
sed -i.bak 's/\x0D$//' "$consensusfile"
#Remove leading '?'s, remove unwanted part of the sequence name (everything after the second '-'),
#add the suffix to all headers and split multiple fasta from Geneious into individual fasta sequences
split_consensus "$consensusfile" "$headersuffix"
rm "$consensusfile"
rm -f "$unusedfile"
#Former version
#cat consensus.fasta | sed 's/^?*//' | sed "s/-all-no-dups_assembled_to_${pseudoref}_//" | awk '/^>/ {OUT=substr($0,2) ".fasta"}; OUT {print >OUT}'
#Print the contigs of one sample to standard output.
#  $1 - sample name, written into every contig header
#  $2 - fasta file of the sample (first line is the header)
#The header line is dropped, every run of '?' is replaced by a new line (\n), and each resulting
#line is printed as '>Contig'+number (NR)+'_'+sample name followed by the sequence on the next line.
#The input file itself is not modified.
consensus_to_contigs() {
  sed 1d "$2" | tr -s '?' '\n' | awk -v val="$1" '{ print ">Contig" NR "_" val "\n" $1 }'
}

#Suffix of the per-sample consensus files and of the contig files made from them
if [[ $cp =~ "yes" ]]; then
  consensussuffix=_consensus_cpDNAsequence.fasta
  contigsuffix=_contigs_cpDNA.fas
else
  consensussuffix=_consensus_sequence.fasta
  contigsuffix=_contigs.fas
fi
#Make a file with sample names of all per-sample consensus files in the folder.
#The sample name is the file name without the consensus suffix, so it stays correct
#for any number of '_' in the name, and unrelated *.fasta files are ignored.
: > listOfConsensusFiles.txt
#A loop for preparing assemblies (consensus sequences) of individual exons from Geneious consensus
for consensus in *"$consensussuffix"; do
  #Skip the unexpanded pattern when no file matches
  [[ -e $consensus ]] || continue
  file=${consensus%"$consensussuffix"}
  echo "$file" >> listOfConsensusFiles.txt
  consensus_to_contigs "$file" "$consensus" > "$file$contigsuffix"
  #Copy data from scratch to home folder
  cp "$file$contigsuffix" "$path/$type/40contigs"
done
echo -e "finished\n"
#-----------------------BLAT ASSEMBLIES TO REFERENCE-----------------------
echo -e "Generating pslx files using BLAT...\n"
#Copy other transcriptome/genome data from home to scratch/workdir (must be named with suffix *.fas)
if [ "$othersource" != "" ] && [ "$othersource" != "NO" ]; then
  cp -r "$othersourcepath/"* .
fi
#Select the reference: cpDNA CDS when working with cpDNA, probes otherwise
if [[ $cp =~ "yes" ]]; then
  reference=$cpDNACDS
else
  reference=$probes
fi
#The reference must be a regular file; with an empty name the copy below would
#otherwise be applied to the whole HybSeqSource folder
if [ ! -f "$source/$reference" ]; then
  echo -e "'$reference' is missing in 'HybSeqSource'. Exiting...\n"
  exit 3
fi
#Copy reference
cp -r "$source/$reference" .
#Make a new folder for results (stop if impossible, otherwise every pslx would be
#copied over a plain file called '50pslx')
if ! mkdir "$path/$type/50pslx"; then
  echo -e "Cannot create directory '$path/$type/50pslx'. Exiting...\n"
  exit 3
fi
#Make a list of all files with contigs and process them
: > contig_names.txt
for contigfile in *.fas; do
  #Skip the unexpanded pattern when no file matches
  [[ -e $contigfile ]] || continue
  #A reference named *.fas is not a sample and must not be matched against itself
  [[ $contigfile == "$reference" ]] && continue
  echo "$contigfile" >> contig_names.txt
  echo -e "\nProcessing $contigfile..."
  blat -t=DNA -q=DNA -out=pslx -minIdentity="$minident" "$reference" "$contigfile" "${contigfile}.pslx"
  cp "$contigfile.pslx" "$path/$type/50pslx"
done
#Clean scratch/work directory
if [[ $PBS_O_HOST != *".cz" ]]; then
  #Hydra or local run: leave the work directory and delete it
  cd ..
  rm -r workdir03
elif [[ -n $SCRATCHDIR ]]; then
  #MetaCentrum: empty the scratch (skipped when SCRATCHDIR is not set)
  rm -rf $SCRATCHDIR/*
fi
echo -e "\nScript HybPhyloMaker3 finished...\n"