Skip to content

Commit

Permalink
adding some logic to catch if muscle doesn't produce an alignment (#101)
Browse files Browse the repository at this point in the history
  • Loading branch information
Mike Lee authored and Mike Lee committed Jan 31, 2025
1 parent ca1d9fe commit 2ad2d27
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 8 deletions.
75 changes: 70 additions & 5 deletions bin/GToTree
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[0;33m'
NC='\033[0m'
VERSION="v1.8.8"
VERSION="v1.8.9"

if [ "$1" == "--version" ] || [ "$1" == "-v" ]; then
printf "GToTree ${VERSION}\n"
Expand Down Expand Up @@ -1992,7 +1992,7 @@ if [ -n "$amino_acid_files" ]; then
cat $amino_acid_files | parallel -j $num_jobs gtt-amino-acid-parallel.sh {} $tmp_dir $hmm_file $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode $additional_pfam_targets ${ko_targets} ${target_KOs}

### kill backstop ###
# if there was a problem with the parallel fasta genome processing, killing main program here and reporting
# if there was a problem with the parallel amino acid genome processing, killing main program here and reporting
if [ -s ${tmp_dir}/kill_amino_acid_parallel.problem ]; then

problem_assembly=$(head -n 1 ${tmp_dir}/kill_amino_acid_parallel.problem)
Expand Down Expand Up @@ -2404,13 +2404,42 @@ if [ $num_jobs == "1" ]; then

# aligning
if [ $total_input_genomes -ge 1000 ] && [ $override_faster_alignment == 'false' ]; then
muscle -super5 ${tmp_dir}/${SCG}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/aligned.tmp -threads ${num_muscle_threads}
muscle -super5 ${tmp_dir}/${SCG}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${SCG}-aligned.tmp -threads ${num_muscle_threads} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
else
muscle -align ${tmp_dir}/${SCG}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/aligned.tmp -threads ${num_muscle_threads}
muscle -align ${tmp_dir}/${SCG}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${SCG}-aligned.tmp -threads ${num_muscle_threads} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
fi

# checking if alignment was successful (really this is a sloppy way of checking, but it's better than nothing and the muscle logs will be in the stdout and log)
if [ ! -s ${tmp_dir}/${SCG}-aligned.tmp ]; then

printf "\n\n ${RED}############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " ${RED}############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " ####${NC} GToTree is exiting without completing :( ${RED}####\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " ##############################################################################${NC} \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " ${RED}############################################################################## \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

printf " ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC} \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " There was a problem with muscle generating an alignment, so GToTree is exiting. This\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " is most often due to running out of memory, leading to a 'core dumped' message from muscle.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " You can check the muscle log output printed above, specifically look for ${SCG}.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " If you can't access more memory, it would help to reduce the number of included\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " genomes if possible." | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " ${RED}**************************************************************************** ${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

# removing tmp directory unless debug set
if [ $debug_flag == 'false' ]; then
rm -rf $tmp_dir
fi

mv $gtotree_log ${output_dir}/gtotree-runlog.txt
exit

fi

# trimming
trimal -in ${tmp_dir}/aligned.tmp -out ${tmp_dir}/trimmed${target_gene_suffix}.tmp -automated1
trimal -in ${tmp_dir}/${SCG}-aligned.tmp -out ${tmp_dir}/trimmed${target_gene_suffix}.tmp -automated1

# removing linewraps:
sed 's/ .*$//' ${tmp_dir}/trimmed${target_gene_suffix}.tmp | awk '!/^>/ { printf "%s", $0; n="\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' > ${tmp_dir}/formatted${target_gene_suffix}.tmp
Expand Down Expand Up @@ -2465,6 +2494,42 @@ else

cat ${tmp_dir}/final_genes_list.tmp | parallel -j $num_jobs gtt-align-and-trim-parallel.sh {} $tmp_dir $faster_alignment $num_muscle_threads $target_gene_suffix

### kill backstop ###
# if there was a problem with the alignments, killing main program here and reporting

if [ -f ${tmp_dir}/kill_align_and_trim_parallel.problem ]; then

problem_alignment=$(head -n 1 ${tmp_dir}/kill_align_and_trim_parallel.problem)

printf "\n\n ${RED}############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " ${RED}############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " ####${NC} GToTree is exiting without completing :( ${RED}####\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " ##############################################################################${NC} \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " ${RED}############################################################################## \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

printf " ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC} \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " There was a problem with muscle generating an alignment, so GToTree is exiting. This\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " is most often due to running out of memory, leading to a 'core dumped' message from muscle.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " You can check the muscle log output in one of the problem sets by looking at:\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " ${problem_alignment}-muscle.log. If you can't access more memory, it would help to\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " reduce the number of included genomes if possible." | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
printf " ${RED}**************************************************************************** ${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

# copying muscle log file to primary working directory
cp ${tmp_dir}/${problem_alignment}-muscle.log .

# removing tmp directory unless debug set
if [ $debug_flag == 'false' ]; then
rm -rf $tmp_dir
fi

mv $gtotree_log ${output_dir}/gtotree-runlog.txt
exit

fi

fi

printf "\n\n\n________________________________________________________________________________\n\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
Expand Down
12 changes: 9 additions & 3 deletions bin/gtt-align-and-trim-parallel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,15 @@ gtt-parse-fasta-by-headers -i ${tmp_dir}/${1}_hits_filtered.tmp -w ${tmp_dir}/so

# aligning
if [ $faster_alignment == 'true' ]; then
muscle -super5 ${tmp_dir}/${1}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${1}_aligned.tmp -threads ${num_muscle_threads} &> /dev/null
muscle -super5 ${tmp_dir}/${1}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${1}_aligned.tmp -threads ${num_muscle_threads} > ${tmp_dir}/${1}-muscle.log 2>&1
else
muscle -align ${tmp_dir}/${1}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${1}_aligned.tmp -threads ${num_muscle_threads} &> /dev/null
muscle -align ${tmp_dir}/${1}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${1}_aligned.tmp -threads ${num_muscle_threads} > ${tmp_dir}/${1}-muscle.log 2>&1
fi

# checking if alignment was successful (really this is a sloppy way of checking, but it's better than nothing and the muscle log file will be available)
if [ ! -s ${tmp_dir}/${1}_aligned.tmp ]; then
printf "${1}\n" >> ${tmp_dir}/kill_align_and_trim_parallel.problem
exit
fi

# trimming
Expand All @@ -42,7 +48,7 @@ if [ -s ${tmp_dir}/${1}_needed_gappers.tmp ]; then

# getting length of the alignment for the current gene:
aln_length_tmp=$(sed -n '2p' ${tmp_dir}/${1}_formatted${target_gene_suffix}.tmp | wc -c | tr -s " " | cut -f2 -d " ")
# subtracting 1 for newline characters
# subtracting 1 for newline characters
aln_length_tmp=$(echo "$aln_length_tmp"-1 | bc)
# making a string of gaps the length of the alignment for those missing it:
gap_seq=$(printf "%0.s-" $(seq 1 1 $aln_length_tmp))
Expand Down

0 comments on commit 2ad2d27

Please sign in to comment.