-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathErassignment
executable file
·231 lines (173 loc) · 10.6 KB
/
Erassignment
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#!/bin/bash
### Erassignment v1.0.2 (C) Max Planck Society for the Advancement of Science
###
### Code developed by Josue Barrera-Redondo <josue.barrera@tuebingen.mpg.de>
###
### This program is free software: you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation, either version 3 of the License, or
### (at your option) any later version.
###
### This program is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
### GNU General Public License for more details.
# USAGE: Erassignment [DIAMOND/BLAST results with the query sequence in the first column and the NCBI taxids in the last column] [modified "ncbi_lineages" file generated from ncbitax2lin] [taxid of query species] [percentage threshold of incomplete phylostrata] [/path/to/tmp_files] [boolean to print the deepest homolog for each gene] [/path/to/output_files] [boolean to FAST_STEP3 implementation] [optional table with NCBI taxonomy IDs and evolutionary distances for abSENSE]
# Example: Erassignment Diamond_results.txt ncbi_lineages_2021-10-26.csv 7227 30 /path/to/tmp true /path/to/output false evolutionary_distances.tsv
### IMPORTANT NOTE: the "ncbi_lineages" file should be arranged so that the taxid is in the first column and all the phylostrata of interest are organized from the species level all the way back to LUCA
DIAMONDOUT=$1
NCBITAX=$2
QTAXID=$3
THRESHOLD=$4
TMP_PATH=$5
PRINTOLDEST=$6
OUTDIR=$7
FAST_STEP3=$8
DIVERGENCE=$9
# Extract the philostrata corresponding to the query organism
QPHYLOS=$(awk -F "," -v QTAXID="$QTAXID" '{ if ($1 == QTAXID) print $0 }' ${NCBITAX})
# Extract the oldest assignable phylostratum for the query organism to limit the phylostratigraphic analysis to that level
OLDEST=$(echo $QPHYLOS | awk -F "," '{ print $NF }')
# Calculate total number of phylostrata to analyze to avoid infinite loops
PHYLONUM=$(echo $QPHYLOS | sed "s/,/\n/g" | wc -l)
# FOR loop to calculate the corresponding phylostrata for each gene
for GENE in $(cat ${TMP_PATH}/tmp_gene_list); do
if [[ "${FAST_STEP3}" = "false" ]]; then
# Generate a temporary file with the DIAMOND/MMseqs2 matches for the gene that is currently in the loop
grep "${GENE}" ${DIAMONDOUT} > ${TMP_PATH}/tmp_${GENE}.bout
fi
# Extract all the non-redundant taxids of the subject matches and extract their corresponding phylostrata. Limit the taxonomic search against sequences contained in the oldest phylostratum of the query organism
awk -v GENE="$GENE" '{ if ($1 == GENE) print $NF }' ${TMP_PATH}/tmp_${GENE}.bout | sed 's/;/\n/g' | sort -u -T ${TMP_PATH} | sed '/^[[:space:]]*$/d' | awk -F "," 'FNR==NR{ a[$1]=$0;next } ($1 in a)' - ${NCBITAX} | grep "$OLDEST" > ${TMP_PATH}/tmp_phylostrata
if [ -s ${TMP_PATH}/tmp_phylostrata ]; then
# Assign a rank to the phylostrata of the query organism
PHYLORANK=$(echo $QPHYLOS | sed "s/,/\n/g" | tail -n +2 | wc -l)
# Calculate the total number of taxonomic comparisons for a given gene
TOTALTAXA=$(cat ${TMP_PATH}/tmp_phylostrata | wc -l)
# Create a counter that increases throughout each passing phylostratum
COUNTER=2
# Loop throughout each phylostratum of the query organism
IFS=$'\n'
for STRATUM in $(echo $QPHYLOS | sed "s/,/\n/g" | tail -n +2); do
# Evaluate whether all the taxonomic comparisons match at each phylostratum, going from species level all the way back to LUCA
MATCHINGTAXA=$(awk -F "," -v COUNTER="$COUNTER" -v STRATUM="$STRATUM" '{ if ($COUNTER == STRATUM) print $0 }' ${TMP_PATH}/tmp_phylostrata | wc -l)
# If it matches at the species level:
if [ $TOTALTAXA == $MATCHINGTAXA ] && [ $COUNTER -eq 2 ]; then
echo -e "$GENE\t$STRATUM\t$PHYLORANK\t100" >> ${OUTDIR}/${QTAXID}_gene_ages.tsv
if [[ "${PRINTOLDEST}" = "true" ]]; then
sort -T ${TMP_PATH} -nk4 ${TMP_PATH}/tmp_${GENE}.bout | tail -1 >> ${OUTDIR}/${QTAXID}_deepest_homolog.tsv
fi
break
# Once all the taxonomic comparisons match a certain phylostratum above the species level, check the taxonomic representativeness in the previous strata to evaluate possible HGT events (or contamination)
elif [ $TOTALTAXA == $MATCHINGTAXA ] && [ $COUNTER -gt 2 ]; then
# Declare a boolean variable stating that the current phylostratum is complete:
INCOMPLETE=false
# Calculate the number of phylostrata levels that need to be evaluated (exluding the species-level, which can't be evaluated for completeness)
COVEREDSTRATA=$(($COUNTER - 2))
# Create an additional counter to calculate the number of complete phylostrata
GOODSTRATA=0
# Create an inverted counter to go all the way back to the shallower phylostrata
INVERTEDCOUNTER=$(($COUNTER - 1))
# If the user specifies it, extract the earliest-diverging match with the highest bitscore
if [[ "${PRINTOLDEST}" = "true" ]]; then
TAXLEVEL=$(echo $QPHYLOS | awk -F "," -v INVERTEDCOUNTER="$INVERTEDCOUNTER" '{ print $INVERTEDCOUNTER }')
awk -F "," -v INVERTEDCOUNTER="$INVERTEDCOUNTER" -v TAXLEVEL="$TAXLEVEL" '{ if ($INVERTEDCOUNTER != TAXLEVEL) print $1 }' ${TMP_PATH}/tmp_phylostrata | awk 'FNR==NR{ a[$1]=$0;next } ($NF in a)' - ${TMP_PATH}/tmp_${GENE}.bout | sort -T ${TMP_PATH} -nk4 | tail -1 >> ${OUTDIR}/${QTAXID}_deepest_homolog.tsv
fi
# Create another temporal file with the phylostrata that are to be pruned to assay completeness
cat ${TMP_PATH}/tmp_phylostrata > ${TMP_PATH}/tmp_sub_phylostrata
# Create an array to allocate all possible phylostrata to be assigned to the ambiguous genes
ARRAY=()
# while loop to calculate the completeness for each phylostratum
while [ $INVERTEDCOUNTER -ge 2 ]; do
TAXLEVEL=$(echo $QPHYLOS | awk -F "," -v INVERTEDCOUNTER="$INVERTEDCOUNTER" '{ print $INVERTEDCOUNTER }')
SUBTOTALTAXA=$(cat ${TMP_PATH}/tmp_sub_phylostrata | wc -l)
SUBMATCHINGTAXA=$(awk -F "," -v INVERTEDCOUNTER="$INVERTEDCOUNTER" -v TAXLEVEL="$TAXLEVEL" '{ if ($INVERTEDCOUNTER == TAXLEVEL) print $0 }' ${TMP_PATH}/tmp_sub_phylostrata | wc -l)
# If the phylostratum has representative taxa at that level other than that of the query organism
if [ $SUBTOTALTAXA -gt $SUBMATCHINGTAXA ]; then
# Prune away the taxa that do not match the taxonomic label of the query organism for that phylostratum, and mark the stratum as "complete" (i.e., with representatives outside the query organism's classification)
awk -F "," -v INVERTEDCOUNTER="$INVERTEDCOUNTER" -v TAXLEVEL="$TAXLEVEL" '{ if ($INVERTEDCOUNTER == TAXLEVEL) print $0 }' ${TMP_PATH}/tmp_phylostrata > ${TMP_PATH}/tmp_sub_phylostrata
let GOODSTRATA+=1
# If you find a complete phylostratum, but the previous one was incomplete, add it to the list of potential phylostrata for that gene and reset the completness boolean
if [ $INCOMPLETE ]; then
BACKTRACK=$(($INVERTEDCOUNTER + 1))
ARRAY+=$(echo $QPHYLOS | awk -F "," -v BACKTRACK="$BACKTRACK" '{ print $BACKTRACK"," }')
INCOMPLETE=false
fi
# If the strata is incomplete, then change the boolean variable
elif [ $SUBTOTALTAXA == $SUBMATCHINGTAXA ]; then
INCOMPLETE=true
fi
let INVERTEDCOUNTER-=1
# Add the last taxonomic level to the array if the previous level was incomplete (since the gene is obviously present in the query)
if [ $INVERTEDCOUNTER -lt 2 ]; then
if [ $INCOMPLETE ]; then
BACKTRACK=$(($INVERTEDCOUNTER + 1))
ARRAY+=$(echo $QPHYLOS | awk -F "," -v BACKTRACK="$BACKTRACK" '{ print $BACKTRACK"," }')
INCOMPLETE=false
fi
fi
done
rm ${TMP_PATH}/tmp_sub_phylostrata
PROPORTIONSTRATA=$((100 * $GOODSTRATA / $COVEREDSTRATA))
# If the younger phylostrata are complete enough, then assign the oldest matching phylostratum as the potential age of the gene
if [ $PROPORTIONSTRATA -gt $THRESHOLD ]; then
echo -e "$GENE\t$STRATUM\t$PHYLORANK\t$PROPORTIONSTRATA" >> ${OUTDIR}/${QTAXID}_gene_ages.tsv
# Otherwise flag the gene as potential contamination or HGT event.
else
echo -e "$GENE\tPossible contamination or HGT\tNA\t$PROPORTIONSTRATA" >> ${OUTDIR}/${QTAXID}_gene_ages.tsv
# Create a list of possible assignable phylostrata for these ambiguous genes
echo -e "$GENE\t${ARRAY[@]}" | sed 's/,$//' >> ${OUTDIR}/${QTAXID}_ambiguous_phylostrata.tsv
fi
break
# If the search for matching phylostrata doesn't reach a consesnsus at the oldest phylostratum, then assign the gene as putatively viral (this is no longer supposed to happen, but I kept it to avoid infinite loops).
elif [ $PHYLONUM == $COUNTER ]; then
echo -e "$GENE\tERROR IN PHYLOSTRATUM ASSIGNMENT, PLEASE CHECK FOR ANY INCONSISTENCIES IN THE \"ncbi_lineages\" FILE\tNA\tNA" >> ${OUTDIR}/${QTAXID}_gene_ages.tsv
echo -e "ERROR IN PHYLOSTRATUM ASSIGNMENT OF $GENE, PLEASE CHECK FOR ANY INCONSISTENCIES IN THE \"ncbi_lineages\" FILE"
break
# If there is not a complete match between the taxonomic comparisons and the phylostratum (i.e., the gene is older than that phylostratum), then go to a deeper taxonomic level and search again
else
let COUNTER+=1
let PHYLORANK-=1
fi
done
else
echo -e "$GENE\tAbsent from the DIAMOND/MMseqs2 results\tNA\tNA" >> ${OUTDIR}/${QTAXID}_gene_ages.tsv
if [[ "${PRINTOLDEST}" = "true" ]]; then
echo -e "$GENE\tno_matches\tNA\tNA\tNA\tNA\tNA">> ${OUTDIR}/${QTAXID}_deepest_homolog.tsv
fi
fi
rm ${TMP_PATH}/tmp_phylostrata
# If abSENSE is invoked, generate the bitscore input file
if [[ -f ${DIVERGENCE} ]]; then
echo -e "${GENE}" > ${TMP_PATH}/tmp_${QTAXID}_${GENE}_bitscore.tsv
IFS=$'\n'
for SPECIES in $(cat ${DIVERGENCE}); do
SP_ID=$(echo ${SPECIES} | cut -d " " -f1)
# Extract the bitscore from the top hit for that gene for each species in the divergence list
PUTATIVE_ORTHO=$(awk -v SP_ID="$SP_ID" '{ if ($NF == SP_ID) print $(NF-1) }' ${TMP_PATH}/tmp_${GENE}.bout | sort -V -T ${TMP_PATH} | tail -1)
if [[ -z ${PUTATIVE_ORTHO} ]]; then
echo -e "0" >> ${TMP_PATH}/tmp_${QTAXID}_${GENE}_bitscore.tsv
else
echo -e "${PUTATIVE_ORTHO}" >> ${TMP_PATH}/tmp_${QTAXID}_${GENE}_bitscore.tsv
fi
done
# Generate the bitscore input file for abSENSE
awk '
{
for (i=1; i<=NF; i++) {
a[NR,i] = $i
}
}
NF>p { p = NF }
END {
for(j=1; j<=p; j++) {
str=a[1,j]
for(i=2; i<=NR; i++){
str=str" "a[i,j];
}
print str
}
}' ${TMP_PATH}/tmp_${QTAXID}_${GENE}_bitscore.tsv | sed 's/ /\t/g' >> ${OUTDIR}/${QTAXID}_bitscores.tsv
rm ${TMP_PATH}/tmp_${QTAXID}_${GENE}_bitscore.tsv
fi
rm ${TMP_PATH}/tmp_${GENE}.bout
done