genEra

#!/bin/bash

### genEra v1.4.2 (C) Max Planck Society for the Advancement of Science
###
### Code developed by Josue Barrera-Redondo <josue.barrera@tuebingen.mpg.de>
###
### This program is free software: you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation, either version 3 of the License, or
### (at your option) any later version.
###
### This program is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU General Public License for more details.

VERSION='1.4.2'

QUERY_FASTA=''
QUERY_PDB=''
NCBITAX=''
NR_DB=''
ALPHAFOLD_DB=''
THREADS=$(nproc)
THRESHOLD='30'
DIAMONDOUT=''
TAXDUMP=''
RAWTAX=''
CUSTOMTAX=''
CUSTOMDATA=''
CUSTOM_PDB=''
CUSTOMGENOMES=''
EVALUE='1e-5'
FOLDSEEK_MAXSEQS='10000'
DIAMONDOPTS=''
MINHITS='10'
PRINTOLDEST='false'
DIVERGENCE=''
OUTDIR=$(pwd)
TMP_PATH=''
SENSITIVITY='sensitive'
JACKHMMER='false'
HMMERTEST='2'
STRAINS=''
INPUT_TREE=''
FAST_STEP3='true'

print_usage() {
  printf "\ngenEra v${VERSION} (C) Max Planck Society for the Advancement of Science\n\n  BASIC USAGE\n\tgenEra -q [query_sequences.fasta] -t [query_taxid] -b [path/to/nr] -d [path/to/taxdump]\n     OR\n\tgenEra -Q [query_structure_directory] -t [query_taxid] -B [path/to/AlphaFold_DB] -d [path/to/taxdump]\n\n  DESCRIPTION\n\tgenEra is an easy-to-use, low-dependency command-line tool that\n\testimates the age of the earliest common ancestor of protein\n\tcoding genes though genomic phylostratigraphy.\n\n  MANDATORY ARGUMENT ALWAYS\n\t-t\tNCBI Taxonomy ID of query species (search for the taxid of\n\t\tyour query species at https://www.ncbi.nlm.nih.gov/taxonomy)\n\n  OPTIONS FOR MANDATORY INPUT FILE\n\t-q\tFile with query protein sequences in FASTA format to perform\n\t\tpairwise sequence alignments using DIAMOND\n\t-Q\tDirectory with query structural predictions in PDB format\n\t\tto perform pairwise structural alignments using Foldseek\n\n  OPTIONS FOR MANDATORY DATABASE\n\t-b\tPath to a locally installed database for DIAMOND (FASTA only)\n\t-B\tPath to a locally installed database for Foldseek (PDB only)\n\t-p\tPre-generated DIAMOND/Foldseek table (skip step 1), with the\n\t\tquery genes in the first column, the bitscore in the second\n\t\tto last column and the target taxid in the last column\n\t\t(IMPORTANT: the query sequences must also be searched\n\t\tagainst themselves for genEra to work properly)\n\n  OPTIONS FOR MANDATORY TAXONOMY FILE\n\t-d\tLocation of the uncompressed taxonomy dump from the NCBI\n\t\t(ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz)\n\t-r\tRaw \"ncbi_lineages\" file generated by ncbitax2lin\n\t\t(saves time on step 2)\n\t-c\tCustom \"ncbi_lineages\" file that is already tailored for the\n\t\tquery species (skip step 2). The table should be arranged so\n\t\tthat the taxid is in the first column and all the phylostrata\n\t\tof interest are organized from the species level all the way\n\t\tback to \"cellular organisms\"\n\n  IMPORTANT OPTIONAL ARGUMENTS\n\t-n\tNumber of threads to run genEra (genEra can run with a single\n\t\tthread, but it is HIGHLY suggested to use as many threads as\n\t\tpossible) (default: all available threads)\n\t-o\tDirectory to save output files (default: working directory)\n\t-v\tList with protein FASTA files to perform an infraspecies-level\n\t\tanalysis against the query proteome. Particularly useful for\n\t\tdetecting candidate de novo genes. The user can give GenEra a\n\t\tprecomputed phylogeny with -z or let GenEra estimate the\n\t\tevolutionary relationship between the listed proteomes using\n\t\tOrthoFinder. Make sure the file names are unique and recognizable.\n\t-z\tPrecomputed phylogeny in NEWICK format for the infraspecies-level\n\t\tanalysis. Make sure that the names in the phylogeny match the\n\t\tprefixes of the FASTA files (-v).\n\t-s\tTable with pairwise evolutionary distances (substitutions/site)\n\t\tbetween several species in the database and the query species\n\t\t(necessary to calculate homology detection failure probabilities\n\t\twith abSENSE). NOTE: the query species SHOULD be included in\n\t\tthis table. The table should be tab-delimited and have the\n\t\tfollowing format:\n\t\t   query_sp_taxid\t0\n\t\t   species_1_taxid\tdistance_1\n\t\t   species_2_taxid\tdistance_2\n\t-a\tTable with additional proteins to be included in the analysis\n\t\t(e.g., proteins from species that are absent from the nr).\n\t\tUse this option when using FASTA sequences as input.\n\t\tThe table should be tab-delimited and have the following format:\n\t\t   /path/to/species_1.fasta\ttaxid_1\n\t\t   /path/to/species_2.fasta\ttaxid_2\n\t\t   /path/to/species_3.fasta\ttaxid_3\n\t-f\tTable with additional nucleotide sequences to search against\n\t\tyour query proteins. Particularly useful with genome assemblies\n\t\tfor improved orphan gene classification. Use this option when\n\t\tusing FASTA sequences as input. The table should be\n\t\ttab-delimited and have the following format:\n\t\t   /path/to/genome_1.fasta\ttaxid_1\n\t\t   /path/to/genome_2.fasta\ttaxid_2\n\t\t   /path/to/genome_3.fasta\ttaxid_3\n\t-A\tTable with additional structural predictions to be included\n\t\tin the analysis (e.g., structural predictions from species\n\t\tthat are absent from the AlphaFold DB). Use this option when\n\t\tusing PDB files as input. The table should be tab-delimited\n\t\tand have the following format:\n\t\t   /path/to/species_directory_1\ttaxid_1\n\t\t   /path/to/species_directory_2\ttaxid_2\n\t\t   /path/to/species_directory_3\ttaxid_3\n\t-i\tWhen true, prints an additional output file with the best\n\t\tsequence hit responsible for the oldest gene age assignment\n\t\tfor each of the query genes (default: true)\n\t-F\tSpeed up STEP 3 gen age assignment at the cost of using\n\t\tmore RAM, e.g 200 GB RAM for 180 GB STEP 1 .bout output,\n\t\tand temporary files, as much temp files as the number of\n\t\tquery genes. Please take into account the requeriments before\n\t\trunning. Set -F false to run genEra with fewer resources at the\n\t\tcost of longer running times. Speed up of x ~ 5 times faster per\n\t\t10,000 query genes (-F true vs -F false). Speed improvement\n\t\twill increase gradually with bigger queries. (default: true)\n\n  FINE-TUNNING ARGUMENTS (DEFAULT IS USUALLY FINE)\n\t-l\tTaxonomic representativeness threshold below which a gene will\n\t\tbe flagged as putative genome contamination or the product of\n\t\ta horizontal gene transfer (HGT) event (default: 30)\n\t-e\tE-value threshold for DIAMOND and Foldseek (default: 1e-5)\n\t-u\tAdditional options to feed DIAMOND or Foldseek, based on user\n\t\tpreferences (filtering  hits by identity or query coverage)\n\t\tUsers should input the additional commands in quotes, using\n\t\tthe original arguments from DIAMOND (Example: -o \"--id 30\")\n\t\tor from Foldseek (Example: -o \"-c 0.8 --cov-mode 0\")\n\t-m\tMinimum percentage of matches between your query sequences\n\t\tand any species within a taxonomic level to consider it useful\n\t\tfor the gene age assignment (i.e., filtering taxonomic levels\n\t\tlacking complete genomes in the database) (default: 10)\n\t-x\tAlternative path where you would like to store the temporary\n\t\tfiles as well as the DIAMOND/Foldseek results (warning: genEra\n\t\twill generate HUGE temporary files) (default: the files will\n\t\tbe stored in a tmp_[RAMDOMNUM]/ directory created by genEra)\n\t-y\tModify the sensitivity parameter in DIAMOND for faster\n\t\tor more sensitive results in step 1 (default: sensitive)\n\t-M\tAdjust the amount of prefilter made by Foldseek (i.e., the\n\t\tmaximum number of hits that are reported) (default: 10000)\n\t-j\tWhen true, run an additional search step using jackhmmer\n\t\tagainst the online Uniprot database (default: false)\n\t\t(warning 1: Only available when using FASTA sequences)\n\t\t(warning 2: this step requires internet connection)\n\t-k\tStarting taxonomic rank of genes that will be re-analyzed\n\t\tusing jackhmmer (default: 2)\n\t-h\tPrint this help message and exit\n\n"
}

while getopts ':q:Q:t:b:B:n:l:p:d:r:c:a:A:f:e:u:m:M:i:s:o:x:y:j:k:v:z:F:h' flag; do
  case "${flag}" in
    q) QUERY_FASTA=${OPTARG} ;;
    Q) QUERY_PDB=${OPTARG} ;;
    t) NCBITAX=${OPTARG} ;;
    b) NR_DB=${OPTARG} ;;
    B) ALPHAFOLD_DB=${OPTARG} ;;
    n) THREADS=${OPTARG} ;;
    l) THRESHOLD=${OPTARG} ;;
    p) DIAMONDOUT=${OPTARG} ;;
    d) TAXDUMP=${OPTARG} ;;
    r) RAWTAX=${OPTARG} ;;
    c) CUSTOMTAX=${OPTARG} ;;
    a) CUSTOMDATA=${OPTARG} ;;
    A) CUSTOM_PDB=${OPTARG} ;;
    f) CUSTOMGENOMES=${OPTARG} ;;
    e) EVALUE=${OPTARG} ;;
    u) DIAMONDOPTS=${OPTARG} ;;
    m) MINHITS=${OPTARG} ;;
    M) FOLDSEEK_MAXSEQS=${OPTARG} ;;
    i) PRINTOLDEST=${OPTARG} ;;
    s) DIVERGENCE=${OPTARG} ;;
    o) OUTDIR=${OPTARG} ;;
    x) TMP_PATH=${OPTARG} ;;
    y) SENSITIVITY=${OPTARG} ;;
    j) JACKHMMER=${OPTARG} ;;
    k) HMMERTEST=${OPTARG} ;;
    v) STRAINS=${OPTARG} ;;
    z) INPUT_TREE=${OPTARG} ;;
	F) FAST_STEP3=${OPTARG} ;;
    h) print_usage ; exit;;
    *) echo ; echo "  ERROR: One or more invalid arguments"; print_usage ; exit;;
  esac
done

# Check if Erassingnation.sh was added to the PATH
PHYLOSCRIPT=$(which Erassignment)
 if [[ -z ${PHYLOSCRIPT} ]]; then
	echo
	echo "  ERROR: Please make sure that Erassignment is located in your PATH"
	echo
	exit 1
fi

# Check if mcl is correctly installed
MCLEXEC=$(which mcl)
MCXLOADEXEC=$(which mcxload)
MCXDUMPEXEC=$(which mcxdump)
if [[ -z ${MCLEXEC} ]] || [[ -z ${MCXLOADEXEC} ]] || [[ -z ${MCXDUMPEXEC} ]]; then
	echo
	echo "  ERROR: Please make sure that mcl, mcxload and mcxdump are located in your PATH"
	echo
	exit 1
fi

# If using a FASTA file, check if DIAMOND is correctly installed and with version 2 onwards
if [[ -f $QUERY_FASTA ]]; then
	DIAMONDEXEC=$(which diamond)
	if [[ -z ${DIAMONDEXEC} ]]; then
		echo
		echo "  ERROR: Please make sure that diamond is located in your PATH"
		echo
		exit 1
	fi

	DIAMONDVERSION=$(diamond help | head -1 | cut -d' ' -f2 | cut -d. -f1 | sed 's/v//g')
	if [[ ${DIAMONDVERSION} -lt 2 ]]; then
		echo
		echo "  ERROR: The installed version of DIAMOND is outdated"
		echo "  Please update DIAMOND to version 2.0.0 or higher"
		echo
		exit 1
	fi
fi
# Check if Foldseek is correctly installed
if [[ -d $QUERY_PDB ]]; then
	FOLDSEEKEXEC=$(which foldseek)
	if [[ -z ${FOLDSEEKEXEC} ]]; then
		echo
		echo "  ERROR: Please make sure that foldseek is located in your PATH"
		echo
		exit 1
	fi
fi

# If running the infraspecies-level analysis, make sure tree2ncbitax and OrthoFinder are installed
if [[ -f $STRAINS ]]; then
	ORTHOFINDEREXEC=$(which orthofinder)
	TREESCRIPT=$(which tree2ncbitax)
	if [[ -z ${TREESCRIPT} ]]; then
		echo
		echo "  ERROR: The infraspecies-level analysis was invoked (-v), but tree2ncbitax in missing. Make sure that tree2ncbitax is located in your PATH"
		echo
		exit 1
	fi

	if [ ! "$INPUT_TREE" ]; then
		if [[ -z ${ORTHOFINDEREXEC} ]]; then
			echo
			echo "ERROR: The infraspecies-level analysis was invoked without an specified tree, but orthofinder is missing. Make sure that orthofinder is located in your PATH, or feed genEra with a pre-computed phylogeny (-z) for the analysis"
			echo
			exit 1
		fi
	fi
fi

# If FAST_STEP3 is turned true check if FASTSTEP3R scripts is added to the PATH
if [[ "${FAST_STEP3}" = "true" ]]; then
	F3R=$(which FASTSTEP3R)
   if [[ -z ${F3R} ]]; then
		echo
		echo "  ERROR: Please make sure that FASTSTEP3R script is located in your PATH"
		echo
		exit 1
   fi
fi

# Check if the mandarory arguments were used
if [ ! "$QUERY_FASTA" ] && [ ! "$QUERY_PDB" ]; then
	echo
	echo "  ERROR: The user must provide either a protein multi-FASTA file (-q) or a directory containing protein PDB files (-Q)"
	print_usage >&2; exit 1
fi

if [ ! "$NCBITAX" ]; then
	echo
	echo "  ERROR: The user must provide a NCBI taxonomy ID of the query species (-t)"
	print_usage >&2; exit 1
fi

# Avoid using FASTA and PDB files at the same time
if [[ -f $QUERY_FASTA ]] && [[ -d $QUERY_PDB ]]; then
	echo
	echo "  ERROR: The user must NOT provide a FASTA file (-q) and a directory with PDB files (-Q) at the same time"
	echo "  Choose one or the other"
	print_usage >&2; exit 1
fi

# Quick check to verify that the input sequences exist
if [ "$QUERY_FASTA" ]; then
	if [[ ! -f $QUERY_FASTA ]]; then
		echo
		echo "  ERROR: The query FASTA file ${QUERY_FASTA} does not exist"
		print_usage >&2; exit 1
	fi
fi

if [ "$QUERY_PDB" ]; then
	if [[ ! -d $QUERY_PDB ]]; then
		echo
		echo "  ERROR: The query directory with PDB files ${QUERY_PDB} does not exist"
		print_usage >&2; exit 1
	fi

	PDBSIZE=$(ls -l ${QUERY_PDB} | head -1 | cut -d " " -f2)
	if [ $PDBSIZE -eq 0 ]; then
		echo
		echo "  ERROR: The query directory ${QUERY_PDB} is empty"
		print_usage >&2; exit 1
	fi
fi

# Check whether the user specified the other mandarory arguments
if [ "$QUERY_FASTA" ]; then
	if [ -z "$NR_DB" ] && [ -z "$DIAMONDOUT" ]; then
	        echo
	        echo "  ERROR: The user must provide either the path to a local DIAMOND database (-b) or a pre-generated DIAMOND/Foldseek table (-p)"
	        print_usage >&2; exit 1
	fi
fi

if [ "$QUERY_PDB" ]; then
	if [ -z "$ALPHAFOLD_DB" ] && [ -z "$DIAMONDOUT" ]; then
		echo
		echo "  ERROR: The user must provide either the path to a local Foldseek database (-b) or a pre-generated DIAMOND/Foldseek table (-p)"
		print_usage >&2; exit 1
	fi
fi

if [ -z "$TAXDUMP" ] && [ -z "$RAWTAX" ] && [ -z "$CUSTOMTAX" ]; then
	echo
	echo "  ERROR: One of the following arguments must be provided (-d, -r or -c)"
	print_usage >&2; exit 1
fi

# If homology detection failure is invoked, make sure that abSENSE is installed and that the input file exists
if [[ -f ${DIVERGENCE} ]]; then

	ABSENSE_SCRIPT=$(which Run_abSENSE.py)
	if [[ -z ${ABSENSE_SCRIPT} ]]; then
		echo
		echo "  ERROR: Run_abSENSE.py was invoked (-s), but the script is not located in your PATH"
		echo "  Please make sure to install it correctly"
		echo
		exit 1
	fi

	if [[ ! -f ${DIVERGENCE} ]]; then
		echo
		echo "  ERROR: genEra could not find the file with the pairwise evolutionary distances:"
		echo "  ${DIVERGENCE}"
		echo "  Exiting"
		exit 1
	fi

fi

echo "genEra v${VERSION} (C) Max Planck Society for the Advancement of Science"
echo "Starting time of run:"
date

# Create output directory if it doesn't exist
if [[ ! -d ${OUTDIR} ]]; then
	mkdir -p ${OUTDIR}
fi

# create a temporary directory if no specific path is given by the user
if [[ -z ${TMP_PATH} ]]; then
	RANDOMSEED=$(echo $RANDOM)
	mkdir tmp_${NCBITAX}_${RANDOMSEED} && cd tmp_${NCBITAX}_${RANDOMSEED}
	TMP_PATH=$(pwd)
	cd ../
else
	RANDOMSEED=$(echo $RANDOM)
	RUNNINGPATH=$(pwd)
	cd ${TMP_PATH} && mkdir tmp_${NCBITAX}_${RANDOMSEED} && cd tmp_${NCBITAX}_${RANDOMSEED}
	TMP_PATH=$(pwd)
	cd ${RUNNINGPATH}
fi

echo
echo "Your temporary files will be stored in ${TMP_PATH}"

# Check whether the FASTA sequences are gzipped or not
if [[ -f ${QUERY_FASTA} ]]; then
	FASTACHECK=$(file ${QUERY_FASTA} | cut -d " " -f2)

	if [[ ${FASTACHECK} == "symbolic" ]]; then
		QUERY_REALPATH=$(ls -l ${QUERY_FASTA} | awk '{ print $11 }')
		QUERY_FASTA="${QUERY_REALPATH}"
		FASTACHECK=$(file ${QUERY_FASTA} | cut -d " " -f2)
	fi

	if [[ ${FASTACHECK} == "gzip" ]]; then
		FASTA_PREFIX=$(echo "${QUERY_FASTA##*/}" | cut -d. -f1)
		zcat < ${QUERY_FASTA} > ${TMP_PATH}/${FASTA_PREFIX}.fasta
		QUERY_FASTA="${TMP_PATH}/${FASTA_PREFIX}.fasta"
	fi

	QUERYSIZE1=$(grep -c ">" ${QUERY_FASTA})
	QUERYSIZE2=$(wc -l ${QUERY_FASTA} | cut -d" " -f1)
	if [ "$QUERYSIZE1" -lt 1 ] || [ "$QUERYSIZE2" -lt 2 ]; then
		echo
		echo "  ERROR: The protein sequences in ${QUERY_FASTA} are not in FASTA format, or the file is empty"
		print_usage >&2; exit 1
	fi
fi

#######################################################
###                 GenEra: STEP 1                  ###
#######################################################

if [ -n "$DIAMONDOUT" ]; then

	# Check that the pre-generated DIAMOND file exists
	if [[ ! -f ${DIAMONDOUT} ]]; then
		echo
		echo "  ERROR: genEra could not find the pre-generated DIAMOND/Foldseek file:"
		echo "  ${DIAMONDOUT}"
		echo "  Exiting"
		exit 1
	fi

	# Check that the pre-generated DIAMOND file is not empty
	if [ -s ${DIAMONDOUT} ]; then
		if [[ -f ${QUERY_FASTA} ]]; then
			echo
			echo "DIAMOND OUTPUT ALREADY GENERATED. SKIPPING STEP 1"
			echo
			echo "We're just going to quickly cluster the query genes against themselves for later on (step 3)"

			# Generate DIAMOND database for self-match
			diamond makedb \
			 --in ${QUERY_FASTA} \
			 --db ${TMP_PATH}/tmp_selfmatch_db --quiet

			# Create a self-match table for the clustering analysis
			diamond blastp --${SENSITIVITY} \
			 --query ${QUERY_FASTA} \
			 --db ${TMP_PATH}/tmp_selfmatch_db \
			 --outfmt 6 qseqid sseqid evalue bitscore \
			 --evalue ${EVALUE} --max-target-seqs 0 \
			 --threads ${THREADS} --out ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout --quiet ${DIAMONDOPTS}

			# Create input file for MCL
			cut -f 1,2,3 ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout > ${TMP_PATH}/tmp_${NCBITAX}.abc

			# Remove temporary files
			rm ${TMP_PATH}/tmp_selfmatch_db* ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout

		elif [[ -d ${QUERY_PDB} ]]; then
			echo
			echo "FOLDSEEK OUTPUT ALREADY GENERATED. SKIPPING STEP 1"
			echo
			echo "We're just going to quickly cluster the query genes against themselves for later on (step 3)"

			# Generate Foldseek database for self-match
			foldseek createdb ${QUERY_PDB} ${TMP_PATH}/tmp_${NCBITAX}_DB -v 0
			foldseek search ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_aln ${TMP_PATH}/tmp_${NCBITAX}_foldseek -e ${EVALUE} --max-seqs ${FOLDSEEK_MAXSEQS} --threads ${THREADS} -v 0 ${DIAMONDOPTS}
			foldseek convertalis ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_aln ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout --format-output "query,target,evalue,bits" -v 0

			# Create input file for MCL
			cut -f 1,2,3 ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout > ${TMP_PATH}/tmp_${NCBITAX}.abc

			# Remove temporary files
			rm ${TMP_PATH}/tmp_${NCBITAX}_DB* ${TMP_PATH}/tmp_${NCBITAX}_aln* ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout
			rm -r ${TMP_PATH}/tmp_${NCBITAX}_foldseek/

		fi
	# If pre-generated DIAMOND file is empty, show error mesage and exit
	else
		echo
		echo "  ERROR: The pre-generated DIAMOND/MMseqs2 file exists, but it is empty"
		echo "  Exiting"
		echo
		exit 1
	fi
elif [[ -f ${QUERY_FASTA} ]]; then

	if [[ ! -f ${NR_DB}.dmnd ]]; then
		echo
		echo "  ERROR: genEra could not find the locally installed database for DIAMOND:"
		echo "  ${NR_DB}.dmnd"
		echo "  Please refer to the manual for information on how to generate such database"
		echo "  Exiting"
		exit 1
	fi

	echo
	echo "STARTING STEP 1: SEARCHING FOR HOMOLOGS WITHIN THE DATABASE USING DIAMOND"

	# Check whether DIAMOND can be found by genEra
	DIAMONDPATH=$(which diamond)
	if [[ -z ${DIAMONDPATH} ]]; then
		echo
		echo "  ERROR: genEra could not find the diamond excecutable. Unable to run step 1"
		echo "  Please make sure that DIAMOND is properly installed and located in the PATH"
		echo "  Exiting"
		exit 1
	fi

	# Start with the infraspecies analysis (in case the user gives the necessary input)
	if [[ -f ${STRAINS} ]]; then

		echo "--------------------------------------------------"
		echo "Infraspecies-level analysis was invoked!"
		echo "Adding infraspecies nodes to the analysis based on a phylogenetic tree"

		if [[ -f ${INPUT_TREE} ]]; then

			echo "The phylogenetic tree was provided by the user, skipping the phylogenetic inference"

		else

			echo "Running OrthoFinder to predict the phylogenetic relationships between genomes"

			# Create a temporary directory for the orthofinder analysis
			mkdir ${TMP_PATH}/orthofinder

			# Copy the query FASTA file to the orthofinder directory
			cp ${QUERY_FASTA} ${TMP_PATH}/orthofinder/

			# Loop over every FASTA file
			IFS=$'\n'
			for STRAIN_FASTA in $(cat ${STRAINS}); do

				FASTACHECK=$(file ${STRAIN_FASTA} | cut -d " " -f2)

				# Check that the file is not a symbolic link
				if [[ ${FASTACHECK} == "symbolic" ]]; then
					FASTA_REALPATH=$(ls -l ${STRAIN_FASTA} | awk '{ print $11 }')
					STRAIN_FASTA="${FASTA_REALPATH}"
					FASTACHECK=$(file ${STRAIN_FASTA} | cut -d " " -f2)
				fi

				# Extract the prefix of every FASTA file
				STRAIN_PREFIX=$(echo "${STRAIN_FASTA##*/}" | cut -d. -f1)

				# Copy each FASTA file to the orthofinder directory
				if [[ ${FASTACHECK} == "gzip" ]]; then
					zcat < ${STRAIN_FASTA} > ${TMP_PATH}/orthofinder/${STRAIN_PREFIX}.fasta
				else
					cat ${STRAIN_FASTA} > ${TMP_PATH}/orthofinder/${STRAIN_PREFIX}.fasta
				fi
			done

			#Run Orthofinder to obtain a phylogenetic tree
			orthofinder -t ${THREADS} -a ${THREADS} -f ${TMP_PATH}/orthofinder -o ${TMP_PATH}/${NCBITAX}_orthofinder_results > ${TMP_PATH}/orthofinder_log_${NCBITAX}

			# Save the Orthofinder rooted tree and dispose of the orthofinder input files in tmp
			cp ${TMP_PATH}/${NCBITAX}_orthofinder_results/Results_*/Species_Tree/SpeciesTree_rooted.txt ${OUTDIR}/${NCBITAX}_orthofinder_tree.nwk
			INPUT_TREE="${OUTDIR}/${NCBITAX}_orthofinder_tree.nwk"
			echo "--------------------------------------------------"
			echo "The phylogenetic relationships of the genomes can be found in ${INPUT_TREE}"
			rm -fr ${TMP_PATH}/orthofinder/
			echo "The raw OrthoFinder results can be found in ${TMP_PATH}/${NCBITAX}_orthofinder_results"

		fi

		echo "--------------------------------------------------"
		echo "Creating a taxonomy file with the infraspecies-level nodes from the phylogeny"

		TARGET_GENOME=$(echo "${QUERY_FASTA##*/}" | cut -d. -f1)

		tree2ncbitax -i ${INPUT_TREE} -g ${TARGET_GENOME} -t ${NCBITAX} -o ${TMP_PATH}/${TARGET_GENOME}.csv

		# Include the additional proteomes to the analysis

		if [ ! "$CUSTOMDATA" ]; then
			CUSTOMDATA="${TMP_PATH}/customdata.tsv"
		fi

		awk -F "," '{ print $2"\t"$1 }' ${TMP_PATH}/${TARGET_GENOME}.csv > ${TMP_PATH}/tmp_strainIDs.tsv
		while read -r file; do
			STRAIN_PREFIX=$(echo "${file##*/}" | cut -d. -f1)
			STRAIN_NUM=$(grep "^$STRAIN_PREFIX"$'\t' ${TMP_PATH}/tmp_strainIDs.tsv | cut -f 2)
			if [ -n "$STRAIN_NUM" ]; then
				echo -e "${file}\t${STRAIN_NUM}" >> "${CUSTOMDATA}"
			fi
		done < "${STRAINS}"

		echo "The additional genomes will be included in the analysis through the file ${CUSTOMDATA}"
		rm ${TMP_PATH}/tmp_strainIDs.tsv

	fi


	echo "--------------------------------------------------"
	echo "Matching the query genes against themselves"

	# Generate DIAMOND database for self-match
	diamond makedb \
	 --in ${QUERY_FASTA} \
	 --db ${TMP_PATH}/tmp_selfmatch_db --quiet

	# Create a self-match table to find orphan genes and for the clustering analysis
	diamond blastp --${SENSITIVITY} \
	 --query ${QUERY_FASTA} \
	 --db ${TMP_PATH}/tmp_selfmatch_db \
	 --outfmt 6 qseqid sseqid evalue bitscore \
	 --evalue ${EVALUE} --max-target-seqs 0 \
	 --threads ${THREADS} --out ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout --quiet ${DIAMONDOPTS}

	# Create input file for MCL
	cut -f 1,2,3 ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout > ${TMP_PATH}/tmp_${NCBITAX}.abc

	# Add the self-matches to the homology table
	awk -v NCBITAX="$NCBITAX" '{ print $0"\t"NCBITAX }' ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout > ${TMP_PATH}/${NCBITAX}_Diamond_results.bout

	# Remove temporary files
	rm ${TMP_PATH}/tmp_selfmatch_db* ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout

	echo "--------------------------------------------------"
	echo "Searching for homologs against the DIAMOND database"

	# Search for hits between the query species and the DIAMOND database
	diamond blastp --${SENSITIVITY} \
	 --query ${QUERY_FASTA} \
	 --db ${NR_DB} \
	 --outfmt 6 qseqid sseqid evalue bitscore staxids \
	 --evalue ${EVALUE} --max-target-seqs 0 \
	 --threads ${THREADS} --out ${TMP_PATH}/${NCBITAX}_Diamond_prefiltered_results.bout --quiet ${DIAMONDOPTS}

	# Verify that DIAMOND ran correctly, otherwise abort and show error
	if [[ -s ${TMP_PATH}/${NCBITAX}_Diamond_prefiltered_results.bout ]]; then
		 echo "DIAMOND results written!"
	else
		echo
		echo "  ERROR: DIAMOND didn't run properly, verify that the database was built correctly"
		echo "  Exiting"
		exit 1
	fi

	# Remove hits that do not have any associated taxids
	awk '$5!=""' ${TMP_PATH}/${NCBITAX}_Diamond_prefiltered_results.bout >> ${TMP_PATH}/${NCBITAX}_Diamond_results.bout
	rm ${TMP_PATH}/${NCBITAX}_Diamond_prefiltered_results.bout

	# Look for homologs in a custom protein database
	if [ -n "$CUSTOMDATA" ]; then

		if [[ -f ${CUSTOMDATA} ]]; then

			echo "--------------------------------------------------"
			echo "Custom protein database found. Searching for homologs in the custom database using DIAMOND"

			# Generate a list of all gene names with their respective taxid, and generate a custom FASTA file with all custom sequences
			IFS=$'\n'
			for SPECIES in $(cat ${CUSTOMDATA}); do
				SP_FASTA=$(echo $SPECIES | cut -d "	" -f1)
				SP_TAXID=$(echo $SPECIES | cut -d "	" -f2)

				FASTACHECK=$(file ${SP_FASTA} | cut -d " " -f2)

				if [[ ${FASTACHECK} == "symbolic" ]]; then
					FASTA_REALPATH=$(ls -l ${SP_FASTA} | awk '{ print $11 }')
					SP_FASTA="${FASTA_REALPATH}"
					FASTACHECK=$(file ${SP_FASTA} | cut -d " " -f2)
				fi

				if [[ ${FASTACHECK} == "gzip" ]]; then
					zcat < ${SP_FASTA} > ${TMP_PATH}/tmp_${SP_TAXID}_proteins
					${SP_FASTA}="${TMP_PATH}/tmp_${SP_TAXID}_proteins"
				fi

				grep ">" ${SP_FASTA} | awk '{ print $1 }' | sed 's/>//g' | sed "s/$/\t${SP_TAXID}/" >> ${TMP_PATH}/tmp_custom_taxid_table
				cat ${SP_FASTA} >> ${TMP_PATH}/tmp_custom_database.fasta

				if [[ -f ${TMP_PATH}/tmp_${SP_TAXID}_proteins ]]; then
					rm ${TMP_PATH}/tmp_${SP_TAXID}_proteins
				fi
			done

			# Generate DIAMOND custom database
			diamond makedb \
			 --in ${TMP_PATH}/tmp_custom_database.fasta \
			 --db ${TMP_PATH}/tmp_custom_database --quiet

			# Search for hits against the custom database
			diamond blastp --${SENSITIVITY} \
			 --query ${QUERY_FASTA} \
			 --db ${TMP_PATH}/tmp_custom_database \
			 --outfmt 6 qseqid sseqid evalue bitscore \
			 --evalue ${EVALUE} --max-target-seqs 0 \
			 --threads ${THREADS} --out ${TMP_PATH}/${NCBITAX}_custom_DB.bout --quiet ${DIAMONDOPTS}

			# Add the taxids to the results of the custom database and merge them with the nr results
			awk '{ print $2 }' ${TMP_PATH}/${NCBITAX}_custom_DB.bout > ${TMP_PATH}/tmp_sseqids
			awk 'NR==FNR{a[$1]=$2;next}{print $0,a[$1]}' ${TMP_PATH}/tmp_custom_taxid_table ${TMP_PATH}/tmp_sseqids > ${TMP_PATH}/tmp_sseqids_taxids
			awk '{ print $2 }' ${TMP_PATH}/tmp_sseqids_taxids | paste -d "\t" ${TMP_PATH}/${NCBITAX}_custom_DB.bout - | awk '$5!=""' >> ${TMP_PATH}/${NCBITAX}_Diamond_results.bout
			rm ${TMP_PATH}/tmp_custom_taxid_table ${TMP_PATH}/tmp_custom_database* ${TMP_PATH}/tmp_sseqids ${TMP_PATH}/tmp_sseqids_taxids ${TMP_PATH}/${NCBITAX}_custom_DB.bout

		else

			# In case the file does not exist, skip this step
			echo "--------------------------------------------------"
			echo "  ERROR: The user specified a custom protein database, but the following file does not exist:"
			echo "  ${CUSTOMDATA}"
			echo "  genEra will continue without including the custom protein database"

		fi

	fi

	if [ -n "$CUSTOMGENOMES" ]; then

		if [[ -f ${CUSTOMGENOMES} ]]; then

			echo "--------------------------------------------------"
			echo "Custom nucleotide database found. Searching proteins against nucleotides using MMseqs2"

			# Check whether MMseqs2 can be found by genEra (This will not result in exit 1)
			MMSEQSPATH=$(which mmseqs)
			if [[ -z ${MMSEQSPATH} ]]; then
				echo
				echo "  ERROR: genEra could not find mmseqs in the PATH"
				echo "  Please make sure to install MMseqs2 before using a custom nucleotide database"
			fi

			# Generate a list of all the genome contigs with their respective taxid, and generate a custom FASTA file with all the custom genomes
			IFS=$'\n'
			for SPECIES in $(cat ${CUSTOMGENOMES}); do
				SP_FASTA=$(echo $SPECIES | cut -d "	" -f1)
				SP_TAXID=$(echo $SPECIES | cut -d "	" -f2)

				FASTACHECK=$(file ${SP_FASTA} | cut -d " " -f2)

				if [[ ${FASTACHECK} == "symbolic" ]]; then
					FASTA_REALPATH=$(ls -l ${SP_FASTA} | awk '{ print $11 }')
					SP_FASTA="${FASTA_REALPATH}"
					FASTACHECK=$(file ${SP_FASTA} | cut -d " " -f2)
				fi

				if [[ ${FASTACHECK} == "gzip" ]]; then
					zcat < ${SP_FASTA} > ${TMP_PATH}/tmp_${SP_TAXID}_nucleotides
					SP_FASTA="${TMP_PATH}/tmp_${SP_TAXID}_nucleotides"
				fi

				grep ">" ${SP_FASTA} | awk '{ print $1 }' | sed 's/>//g' | sed "s/$/\t${SP_TAXID}/" >> ${TMP_PATH}/tmp_custom_genome_taxid_table
				cat ${SP_FASTA} >> ${TMP_PATH}/tmp_custom_genome_database.fasta

				if [[ -f ${TMP_PATH}/tmp_${SP_TAXID}_nucleotides ]]; then
					rm ${TMP_PATH}/tmp_${SP_TAXID}_nucleotides
				fi
			done

			mkdir ${TMP_PATH}/tmp_mmseqs

			# Create databases for mmseqs
			mmseqs createdb ${QUERY_FASTA} ${TMP_PATH}/tmp_${NCBITAX}_DB --dbtype 1 > mmseqs2_log_${NCBITAX}
			mmseqs createdb ${TMP_PATH}/tmp_custom_genome_database.fasta ${TMP_PATH}/tmp_genome_DB --dbtype 2 >> mmseqs2_log_${NCBITAX}

			# Search the query proteins against the genome assemblies in a "tblastn" fashion
			mmseqs search ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_genome_DB ${TMP_PATH}/tmp_aln ${TMP_PATH}/tmp_mmseqs --threads ${THREADS} -e ${EVALUE} --split-memory-limit 100G -s 7.5 >> mmseqs2_log_${NCBITAX}
			mmseqs convertalis ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_genome_DB ${TMP_PATH}/tmp_aln ${TMP_PATH}/${NCBITAX}_custom_genome_DB.bout --format-output "query,target,evalue,bits" >> mmseqs2_log_${NCBITAX}

			awk '{ print $2 }' ${TMP_PATH}/${NCBITAX}_custom_genome_DB.bout > ${TMP_PATH}/tmp_genome_sseqids
			awk 'NR==FNR{a[$1]=$2;next}{print $0,a[$1]}' ${TMP_PATH}/tmp_custom_genome_taxid_table ${TMP_PATH}/tmp_genome_sseqids > ${TMP_PATH}/tmp_genome_sseqids_taxids
			awk '{ print $2 }' ${TMP_PATH}/tmp_genome_sseqids_taxids | paste -d "\t" ${TMP_PATH}/${NCBITAX}_custom_genome_DB.bout - | awk '$5!=""' >> ${TMP_PATH}/${NCBITAX}_Diamond_results.bout
			rm ${TMP_PATH}/tmp_custom_genome_taxid_table ${TMP_PATH}/tmp_custom_genome_database.fasta ${TMP_PATH}/tmp_${NCBITAX}_DB* ${TMP_PATH}/tmp_genome_DB* ${TMP_PATH}/tmp_aln* ${TMP_PATH}/tmp_genome_sseqids_taxids ${TMP_PATH}/tmp_genome_sseqids ${TMP_PATH}/${NCBITAX}_custom_genome_DB.bout
			rm -r ${TMP_PATH}/tmp_mmseqs/

		else

			echo "--------------------------------------------------"
			echo "  ERROR: The user specified a custom nucleotide database, but the following file does not exist:"
			echo "  ${CUSTOMGENOMES}"
			echo "  genEra will continue without including the custom nucleotide database"

		fi

	fi

	DIAMONDOUT="${TMP_PATH}/${NCBITAX}_Diamond_results.bout"

	# Check that the output file of STEP 1 is not empty
	if [ -s ${DIAMONDOUT} ]; then

		echo "--------------------------------------------------"
		echo "Step 1 finished!"
		echo "The DIAMOND/MMseqs2 table can be found in ${DIAMONDOUT}"
		echo "This file is usually HUGE, please dispose of it if you no longer find it useful"
		echo "It can still be used (-p) in case the user wants to re-run genEra while skipping step 1"

	# Otherwise, display error and exit
	else
		echo
		echo "  ERROR: The DIAMOND/MMseqs2 table is empty. Please check that DIAMOND and/or the target database were correctly installed"
		echo "  Exiting"
		echo
		exit 1
	fi

elif [[ -d $QUERY_PDB ]]; then

	if [[ ! -f ${ALPHAFOLD_DB} ]]; then
		echo
		echo "  ERROR: genEra could not find the locally installed database for Foldseek:"
		echo "  ${NR_DB}"
		echo "  Please refer to the manual for information on how to download such database"
		echo "  Exiting"
		exit 1
	fi

	echo
	echo "STARTING STEP 1: SEARCHING FOR HOMOLOGS WITHIN THE DATABASE USING FOLDSEEK"

	# Check whether Foldseek can be found by genEra
	FOLDSEEKEXEC=$(which foldseek)
	if [[ -z ${FOLDSEEKEXEC} ]]; then
		echo
		echo "  ERROR: genEra could not find the foldseek excecutable. Unable to run step 1"
		echo "  Please make sure that Foldseek is properly installed and located in the PATH"
		echo "  Exiting"
		exit 1
	fi

	echo "--------------------------------------------------"
	echo "Matching the query genes against themselves"

	# Generate Foldseek database for the query proteins
	foldseek createdb ${QUERY_PDB} ${TMP_PATH}/tmp_${NCBITAX}_DB -v 0

	# Create a self-match table to find orphan genes and for the clustering analysis
	foldseek search ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_aln ${TMP_PATH}/tmp_${NCBITAX}_foldseek -e ${EVALUE} --max-seqs ${FOLDSEEK_MAXSEQS} --threads ${THREADS} -v 0 ${DIAMONDOPTS}
	foldseek convertalis ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_aln ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout --format-output "query,target,evalue,bits" -v 0

	# Create input file for MCL
	cut -f 1,2,3 ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout > ${TMP_PATH}/tmp_${NCBITAX}.abc

	# Add the self-matches to the homology table
	awk -v NCBITAX="$NCBITAX" '{ print $0"\t"NCBITAX }' ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout > ${TMP_PATH}/${NCBITAX}_Foldseek_results.bout

	# Remove temporary files
	rm ${TMP_PATH}/tmp_${NCBITAX}_aln* ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout

	echo "--------------------------------------------------"
	echo "Searching for homologs against the Foldseek database"

	# Search for hits between the query species and the Foldseek database (e.g., AlphaFold UniProt)
	foldseek search ${TMP_PATH}/tmp_${NCBITAX}_DB ${ALPHAFOLD_DB} ${TMP_PATH}/tmp_${NCBITAX}_aln ${TMP_PATH}/tmp_${NCBITAX}_foldseek -e ${EVALUE} --max-seqs ${FOLDSEEK_MAXSEQS} --threads ${THREADS} -v 0 ${DIAMONDOPTS}

	# Generate table format and remove hits that do not have any associated taxids
	foldseek convertalis ${TMP_PATH}/tmp_${NCBITAX}_DB ${ALPHAFOLD_DB} ${TMP_PATH}/tmp_${NCBITAX}_aln ${TMP_PATH}/${NCBITAX}_Foldseek_prefiltered_results.bout --format-output "query,target,evalue,bits,taxid" -v 0
	awk '$5!=""' ${TMP_PATH}/${NCBITAX}_Foldseek_prefiltered_results.bout >> ${TMP_PATH}/${NCBITAX}_Foldseek_results.bout

	# Verify that Foldseek ran correctly, otherwise abort and show error
	if [[ -s ${TMP_PATH}/${NCBITAX}_Foldseek_prefiltered_results.bout ]]; then
		echo "Foldseek results written!"
	else
		echo "  ERROR: Foldseek didn't run properly, verify that the database was built correctly"
		echo "  Verify that Foldseek didn't run out of RAM"
		echo "  Try increasing the amount of RAM or lower the value of prefilter hits (-M)"
		echo "  Alternatively, use DIAMOND"
		echo "  Exiting"
		exit 1
	fi
	rm ${TMP_PATH}/${NCBITAX}_Foldseek_prefiltered_results.bout ${TMP_PATH}/tmp_${NCBITAX}_aln*

	# Look for homologs in a custom PDB dataset
	if [ -n "$CUSTOM_PDB" ]; then

		if [[ -f ${CUSTOM_PDB} ]]; then

			echo "--------------------------------------------------"
			echo "Custom PDB database found. Searching for homologs in the custom database using Foldseek"
			echo "This will require considerable storage space in ${TMP_PATH}"

			# Generate a list of all gene names with their respective taxid, and generate a custom PDB directory with all the custom structural predictions
			mkdir ${TMP_PATH}/tmp_custom_PDB_DB
			WORKINGDIR=$(pwd)
			IFS=$'\n'
			for SPECIES in $(cat ${CUSTOM_PDB}); do
				SP_PDB=$(echo $SPECIES | cut -d "	" -f1)
				SP_TAXID=$(echo $SPECIES | cut -d "	" -f2)

				cd ${SP_PDB}
				ls | sed "s/$/\t${SP_TAXID}/" >> ${TMP_PATH}/tmp_PDB_taxid_table
				cp * ${TMP_PATH}/tmp_custom_PDB_DB/
				cd ${WORKINGDIR}

			done

			# Generate Foldseek custom database
			foldseek createdb ${TMP_PATH}/tmp_custom_PDB_DB ${TMP_PATH}/tmp_custom_foldseek -v 0
			rm -r ${TMP_PATH}/tmp_custom_PDB_DB/

			# Search for hits against the custom database
			foldseek search ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_custom_foldseek ${TMP_PATH}/tmp_custom_aln ${TMP_PATH}/tmp_${NCBITAX}_foldseek -e ${EVALUE} --max-seqs ${FOLDSEEK_MAXSEQS} --threads ${THREADS} -v 0 ${DIAMONDOPTS}
			foldseek convertalis ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_custom_foldseek ${TMP_PATH}/tmp_custom_aln ${TMP_PATH}/tmp_${NCBITAX}_custom_matches.bout --format-output "query,target,evalue,bits" -v 0

			# Add the taxids to the results of the custom database and merge them with the AlphaFoldDB results
			awk '{ print $2 }' ${TMP_PATH}/tmp_${NCBITAX}_custom_matches.bout > ${TMP_PATH}/tmp_PDBids
			awk 'NR==FNR{a[$1]=$2;next}{print $0,a[$1]}' ${TMP_PATH}/tmp_PDB_taxid_table ${TMP_PATH}/tmp_PDBids > ${TMP_PATH}/tmp_PDBids_taxids
			awk '{ print $2 }' ${TMP_PATH}/tmp_PDBids_taxids | paste -d "\t" ${TMP_PATH}/tmp_${NCBITAX}_custom_matches.bout - | awk '$5!=""' >> ${TMP_PATH}/${NCBITAX}_Foldseek_results.bout
			rm ${TMP_PATH}/tmp_PDB_taxid_table ${TMP_PATH}/tmp_custom_foldseek* ${TMP_PATH}/tmp_PDBids ${TMP_PATH}/tmp_PDBids_taxids ${TMP_PATH}/tmp_${NCBITAX}_custom_matches.bout ${TMP_PATH}/tmp_custom_aln*

		else

			# In case the file does not exist, skip this step
			echo "--------------------------------------------------"
			echo "  ERROR: The user specified a custom PDB database, but the following file does not exist:"
			echo "  ${CUSTOM_PDB}"
			echo "  genEra will continue without including the custom PDB database"

		fi

	fi

	rm ${TMP_PATH}/tmp_${NCBITAX}_DB*
	rm -r ${TMP_PATH}/tmp_${NCBITAX}_foldseek/
	DIAMONDOUT="${TMP_PATH}/${NCBITAX}_Foldseek_results.bout"

	# Check that the output file of STEP 1 is not empty
	if [ -s ${DIAMONDOUT} ]; then

		echo "--------------------------------------------------"
		echo "Step 1 finished!"
		echo "The Foldseek table can be found in ${DIAMONDOUT}"
		echo "This file is usually QUITE BIG, please dispose of it if you no longer find it useful"
		echo "It can still be used (-p) in case the user wants to re-run genEra while skipping step 1"

	# Otherwise, display error and exit
	else
		echo
		echo "  ERROR: The Foldseek table is empty. Please check that foldseek and/or the target database were correctly installed"
		echo "  Exiting"
		echo
		exit 1
	fi

fi

#######################################################
###                 GenEra: STEP 2                  ###
#######################################################

# Generate a taxonomic database based on the daxdump from the NCBI and the output of ncbitax2lin
if [ -n "$CUSTOMTAX" ]; then

	if [ -s ${CUSTOMTAX} ]; then

		echo
		echo "THE SPECIES-TAILORED TAXONOMIC DATABASE WAS PROVIDED BY THE USER. SKIPPING STEP 2"

	else
		echo
		echo "  ERROR: The species-tailored file provided by the user is empty"
		echo "  Exiting"
		echo
		exit 1
	fi

else

	echo
	echo "STARTING STEP 2: GENERATING TAXONOMIC DATABASE FOR THE PHYLOSTRATIGRAPHIC ASSIGNMENT OF YOUR GENES"

	if [ -n "$RAWTAX" ]; then
		if [ -s ${RAWTAX} ]; then

			echo "--------------------------------------------------"
			echo "Using the raw \"ncbi_lineages\" file provided by the user. Skiping ncbitax2lin"

		else
			echo
			echo "  ERROR: The raw \"ncbi_lineages\" file provided by the user is empty"
			echo "  Exiting"
			echo
			exit 1
		fi

	else
		# Check whether ncbitax2lin exists in the PATH
		TAX2LINPATH=$(which ncbitax2lin)
		if [[ -z ${TAX2LINPATH} ]]; then
			echo
			echo "  ERROR: The user attempted to generate the raw \"ncbi_lineages\" file from the NCBI taxdump, but genEra could not find the ncbitax2lin excecutable. Unable to run step 2"
			echo "  Please ensure that ncbitax2lin is correctly installed in your PATH and try again"
			echo "  Or try to use an already existing \"ncbi_lineages\" file with argument -r"
			echo "  The user can resume genEra from this step by including the DIAMOND/MMseqs2 file ${DIAMONDOUT} in the pipeline (-p)"
			exit 1
		fi


		echo "--------------------------------------------------"
		echo "Running ncbitax2lin to generate a raw \"ncbi_lineages\" file from the NCBI taxdump"

		# Verify that the taxdump contains the necessary files to run ncbitax2lin
		if [[ ! -f ${TAXDUMP}/nodes.dmp ]] || [[ ! -f ${TAXDUMP}/names.dmp ]]; then
			echo
			echo "  ERROR: genEra could not find \"nodes.dmp\" and/or \"names.dmp\" within the specified taxdump directory:"
			echo "  ${TAXDUMP}/"
			echo "  Please ensure that the route to the daxdump directory is correct, and that both files exist within that directory"
			echo "  Or try to use an already existing \"ncbi_lineages\" file with argument -r"
			echo "  The user can resume genEra from this step by including the DIAMOND/MMseqs2 file ${DIAMONDOUT} in the pipeline (-p)"
			exit 1
		fi

		# Generate the raw ncbi_lineages file with ncbitax2lin
		ncbitax2lin --nodes-file ${TAXDUMP}/nodes.dmp --names-file ${TAXDUMP}/names.dmp
		gunzip ncbi_lineages_*.csv.gz
		mv ncbi_lineages_*.csv ${OUTDIR}/
		RAWTAX=$(ls ${OUTDIR}/ncbi_lineages_*.csv)

		echo "--------------------------------------------------"
		echo "Finished generating a raw \"ncbi_lineages\" file named ${RAWTAX}"
		echo "Keep it in case you want to run genEra with another species (-r)"

	fi

	# Verify that the raw ncbi_lineages file exists
	if [[ ! -f ${RAWTAX} ]]; then
		echo
		echo "  ERROR: genEra could not find the raw \"ncbi_lineages\" file:"
		echo "  ${RAWTAX}"
		echo "  If step 1 ran succesfully, the user can resume from this step using the argument -p"
		echo "  Exiting"
		exit 1
	fi

	# Verify that the raw ncbi_lineages file is not empty
	if [ -s ${RAWTAX} ]; then
		echo "--------------------------------------------------"
		echo "Rearranging the raw \"ncbi_lineages\" file by taxonomic hierarchy"
	else
		echo
		echo "  ERROR: The raw \"ncbi_lineages\" file is empty"
		echo "  Check that ncbitax2lin was correctly installed. Otherwise, you can download a pre-generated raw \"ncbi_lineages\" file from GenEra's github and use ir with the argument -r"
		echo "  Exiting"
		echo
		exit 1
	fi

	# Extract the line in the "ncbi_lineages" file corresponding to the query organism
	QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${RAWTAX})

	# Download the lineage hierarchy of the query species from the NCBI webpage (in case fullnamelineage.dmp is not found)
	if [[ ! -f ${TAXDUMP}/fullnamelineage.dmp ]]; then
		echo "  WARNING: genEra could not find the file fullnamelineage.dmp within the NCBI taxdump"
		echo "  Please make sure you download the latest version of the NCBI Taxonomy dump: ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz"
		wget -q "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=${NCBITAX}&lvl=3&p=has_linkout&p=blast_url&p=genome_blast&lin=f&keep=1&srchmode=1&unlock"
	fi

	if [[ -f ${TAXDUMP}/fullnamelineage.dmp ]]; then

		echo "Extracting lineage information from ${TAXDUMP}/fullnamelineage.dmp"
		LINEAGE_ARRAY=()
		IFS=$'\n'
		for LINEAGE in $(awk -F "|" -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $3 }' ${TAXDUMP}/fullnamelineage.dmp | sed 's/\t//g' | sed 's/; /\n/g' | sed '/^[[:space:]]*$/d' | grep -v "incertae sedis"); do
			LINEAGE_ARRAY+=($(echo ${QPHYLOS} | sed -n $'1s/,/\\\n/gp' | grep -nx "$LINEAGE" | cut -d: -f1 && echo))
		done
		TOP_LEVEL=$(awk -F "|" -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $2 }' ${TAXDUMP}/fullnamelineage.dmp | sed 's/\t//g')
		LINEAGE_ARRAY+=($(echo ${QPHYLOS} | sed -n $'1s/,/\\\n/gp' | grep -nx "$TOP_LEVEL" | cut -d: -f1 && echo))

		# Extract the columns in "ncbi_lineages" in the correct hierarchical order
		touch ${TMP_PATH}/tmp_column_0
		PRINTCOLUMN="${TMP_PATH}/tmp_column_0"

		for LINEAGECOL in "${LINEAGE_ARRAY[@]}"; do
			awk -F'"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "", $i) } 1' ${RAWTAX} | awk -F "," -v LINEAGECOL="$LINEAGECOL" '{ print $LINEAGECOL }' | paste -d , - ${PRINTCOLUMN} > ${TMP_PATH}/tmp_column_${LINEAGECOL}
			PRINTCOLUMN="${TMP_PATH}/tmp_column_${LINEAGECOL}"
		done
		awk -F'"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "", $i) } 1' ${RAWTAX} | awk -F "," '{ print $1 }' | paste -d , - ${PRINTCOLUMN} | sed 's/,$//' > ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages

		rm ${TMP_PATH}/tmp_column_*

		# Extract the philostrata corresponding to the query organism
		QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages)

		# Extract all the lineages proteins that matched the query sequences under a certain threshold and that belong to the same oldest lineage of the query species (e.g., "cellular organisms")
		if [[ -f $QUERY_FASTA ]]; then
			GENENUM=$(grep -c ">" ${QUERY_FASTA})
		elif [[ -d $QUERY_PDB ]]; then
			GENENUM=$(ls ${QUERY_PDB} | wc -l)
		fi

		OLDEST=$(echo $QPHYLOS | awk -F "," '{ print $NF }')

		echo "--------------------------------------------------"
		echo "Extracting all the lineages that match more than ${MINHITS} percent of your query proteins"

		HITPERC=$(( ${GENENUM}*${MINHITS}/100 ))
		awk '{ print $NF }' ${DIAMONDOUT} | sort -n -T ${TMP_PATH} | uniq -c | awk -v HITPERC="$HITPERC" '{ if ($1 >= HITPERC) print $2 }' | sed '/^[[:space:]]*$/d' | awk -F "," 'FNR==NR{ a[$1]=$0;next } ($1 in a)' - ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages | grep "$OLDEST" > ${TMP_PATH}/tmp_${NCBITAX}_matches

		# Finally, tailor the taxonomic database for the query species with the phylostrata that are useful for the analysis
		COUNTER=2

		echo "--------------------------------------------------"
		echo "Collapsing the phylostrata that are not represented in your DIAMOND results"
		cat ${TMP_PATH}/tmp_${NCBITAX}_matches > ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches
		ARRAY=()
		COLLAPSEDSTRATA=()

		IFS=$'\n'
		for STRATUM in $(echo $QPHYLOS | sed "s/,/\n/g" | tail -n +2); do

			MATCHINGTAXA=$(awk -F "," -v COUNTER="$COUNTER" -v STRATUM="$STRATUM" '{ if ($COUNTER == STRATUM) print $0 }' ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches | wc -l)
			if [ "$MATCHINGTAXA" -ge 1 ]; then
				awk -F "," -v COUNTER="$COUNTER" -v STRATUM="$STRATUM" '{ if ($COUNTER != STRATUM) print $0 }' ${TMP_PATH}/tmp_${NCBITAX}_matches > ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches
				ARRAY+=$COUNTER,
			else
				COLLAPSEDSTRATA+="[${STRATUM}] "
			fi
			let COUNTER+=1

		done

		if [ ${#COLLAPSEDSTRATA[@]} -gt 0 ]; then

			echo "--------------------------------------------------"
			echo "  WARNING: The following phylostrata were collapsed due to lack of sufficient genomic data:"
			echo "  $COLLAPSEDSTRATA"
			echo "  If you want to include them in your analysis, please add the necessary taxa in a custom database (-a or -f), making sure their last common ancestor to the query species can be assigned to that specific taxonomic level in the NCBI taxonomy database"

		fi

		echo "--------------------------------------------------"
		echo "Generating the species-tailored database"

		awk -F "," '{ print $1 }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages > ${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv
		PRINTCOLUMN="${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv"

		for LEVEL in $(echo $ARRAY | sed "s/,/\n/g" | sed '/^[[:space:]]*$/d'); do
			awk -F "," -v LEVEL="$LEVEL" '{ print $LEVEL }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages | paste -d , ${PRINTCOLUMN} - > ${TMP_PATH}/tmp_column_${LEVEL}
			PRINTCOLUMN="${TMP_PATH}/tmp_column_${LEVEL}"
		done

		cat ${PRINTCOLUMN} > ${OUTDIR}/${NCBITAX}_ncbi_lineages.csv

		rm ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages ${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches ${TMP_PATH}/tmp_${NCBITAX}_matches ${TMP_PATH}/tmp_column_*

		CUSTOMTAX="${OUTDIR}/${NCBITAX}_ncbi_lineages.csv"

	elif [[ -f wwwtax.cgi\?mode\=Info\&id\=${NCBITAX}\&lvl\=3\&p\=has_linkout\&p\=blast_url\&p\=genome_blast\&lin\=f\&keep\=1\&srchmode\=1\&unlock ]]; then

		echo "Extracting lineage information from the NCBI webpage"

		# Create an array which contains the column numbers corresponding to the phylostrata of the query species
		LINEAGE_ARRAY=()
		IFS=$'\n'
		for LINEAGE in $(grep "ALT=\"no rank" wwwtax.cgi\?mode\=Info\&id\=${NCBITAX}\&lvl\=3\&p\=has_linkout\&p\=blast_url\&p\=genome_blast\&lin\=f\&keep\=1\&srchmode\=1\&unlock | sed 's/>/\n/g' | grep "</a" | sed 's/<\/a//g' | grep -v "incertae sedis"); do
			LINEAGE_ARRAY+=($(echo ${QPHYLOS} | sed -n $'1s/,/\\\n/gp' | grep -nx "$LINEAGE" | cut -d: -f1 && echo))
		done
		TOP_LEVEL=$(grep "<title>Taxonomy browser" wwwtax.cgi\?mode\=Info\&id\=${NCBITAX}\&lvl\=3\&p\=has_linkout\&p\=blast_url\&p\=genome_blast\&lin\=f\&keep\=1\&srchmode\=1\&unlock | sed 's/  <title>Taxonomy browser (//g' | sed 's/)<\/title>//g')
		LINEAGE_ARRAY+=($(echo ${QPHYLOS} | sed -n $'1s/,/\\\n/gp' | grep -nx "$TOP_LEVEL" | cut -d: -f1 && echo))

		# Extract the columns in "ncbi_lineages" in the correct hierarchical order
		touch ${TMP_PATH}/tmp_column_0
		PRINTCOLUMN="${TMP_PATH}/tmp_column_0"

		for LINEAGECOL in "${LINEAGE_ARRAY[@]}"; do
			awk -F'"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "", $i) } 1' ${RAWTAX} | awk -F "," -v LINEAGECOL="$LINEAGECOL" '{ print $LINEAGECOL }' | paste -d , - ${PRINTCOLUMN} > ${TMP_PATH}/tmp_column_${LINEAGECOL}
			PRINTCOLUMN="${TMP_PATH}/tmp_column_${LINEAGECOL}"
		done
		awk -F'"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "", $i) } 1' ${RAWTAX} | awk -F "," '{ print $1 }' | paste -d , - ${PRINTCOLUMN} | sed 's/,$//' > ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages

		rm wwwtax* ${TMP_PATH}/tmp_column_*

		# Extract the philostrata corresponding to the query organism
		QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages)

		# Extract all the lineages proteins that matched the query sequences under a certain threshold and that belong to the same oldest lineage of the query species (e.g., "cellular organisms")
		if [[ -f $QUERY_FASTA ]]; then
			GENENUM=$(grep -c ">" ${QUERY_FASTA})
		elif [[ -d $QUERY_PDB ]]; then
			GENENUM=$(ls ${QUERY_PDB} | wc -l)
		fi

		OLDEST=$(echo $QPHYLOS | awk -F "," '{ print $NF }')

		echo "--------------------------------------------------"
		echo "Extracting all the lineages that match more than ${MINHITS} percent of your query proteins"

		HITPERC=$(( ${GENENUM}*${MINHITS}/100 ))
		awk '{ print $NF }' ${DIAMONDOUT} | sort -n -T ${TMP_PATH} | uniq -c | awk -v HITPERC="$HITPERC" '{ if ($1 >= HITPERC) print $2 }' | sed '/^[[:space:]]*$/d' | awk -F "," 'FNR==NR{ a[$1]=$0;next } ($1 in a)' - ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages | grep "$OLDEST" > ${TMP_PATH}/tmp_${NCBITAX}_matches

		# Finally, tailor the taxonomic database for the query species with the phylostrata that are useful for the analysis
		COUNTER=2

		echo "--------------------------------------------------"
		echo "Collapsing the phylostrata that are not represented in your DIAMOND results"
		cat ${TMP_PATH}/tmp_${NCBITAX}_matches > ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches
		ARRAY=()
		COLLAPSEDSTRATA=()

		IFS=$'\n'
		for STRATUM in $(echo $QPHYLOS | sed "s/,/\n/g" | tail -n +2); do

			MATCHINGTAXA=$(awk -F "," -v COUNTER="$COUNTER" -v STRATUM="$STRATUM" '{ if ($COUNTER == STRATUM) print $0 }' ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches | wc -l)
			if [ "$MATCHINGTAXA" -ge 1 ]; then
				awk -F "," -v COUNTER="$COUNTER" -v STRATUM="$STRATUM" '{ if ($COUNTER != STRATUM) print $0 }' ${TMP_PATH}/tmp_${NCBITAX}_matches > ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches
				ARRAY+=$COUNTER,
			else
				COLLAPSEDSTRATA+="[${STRATUM}] "
			fi
			let COUNTER+=1

		done

		if [ ${#COLLAPSEDSTRATA[@]} -gt 0 ]; then

			echo "--------------------------------------------------"
			echo "  WARNING: The following phylostrata were collapsed due to lack of sufficient genomic data:"
			echo "  $COLLAPSEDSTRATA"
			echo "  If you want to include them in your analysis, please add the necessary taxa in a custom database (-a or -f), making sure their last common ancestor to the query species can be assigned to that specific taxonomic level in the NCBI taxonomy database"

		fi

		echo "--------------------------------------------------"
		echo "Generating the species-tailored database"

		awk -F "," '{ print $1 }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages > ${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv
		PRINTCOLUMN="${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv"

		for LEVEL in $(echo $ARRAY | sed "s/,/\n/g" | sed '/^[[:space:]]*$/d'); do
			awk -F "," -v LEVEL="$LEVEL" '{ print $LEVEL }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages | paste -d , ${PRINTCOLUMN} - > ${TMP_PATH}/tmp_column_${LEVEL}
			PRINTCOLUMN="${TMP_PATH}/tmp_column_${LEVEL}"
		done

		cat ${PRINTCOLUMN} > ${OUTDIR}/${NCBITAX}_ncbi_lineages.csv

		rm ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages ${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches ${TMP_PATH}/tmp_${NCBITAX}_matches ${TMP_PATH}/tmp_column_*

		CUSTOMTAX="${OUTDIR}/${NCBITAX}_ncbi_lineages.csv"

	else

		echo "--------------------------------------------------"
		echo "  ERROR: genEra was unable to retrieve the lineage information of your query species from the local NCBI taxdump or from the NCBI webpage"
		echo "  genEra will attempt to organize the phylostrata in the correct taxonomic hierarchy"
		echo "  This may result in the loss of some phylostrata from the species-tailored database"

		# Eliminate the inner commas from some of the taxa in the raw file and arrange the lineages by taxonomic hierarchies
		awk -F'"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "", $i) } 1' ${RAWTAX} | awk -F "," '{ print $1","$8","$52","$51","$57","$7","$63","$68","$56","$6","$65","$45","$36","$59","$5","$66","$55","$30","$35","$54","$4","$64","$60","$3","$67","$58","$38","$2","$40 }' > ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages

		# Extract the philostrata corresponding to the query organism
		QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages)

		# Extract all the lineages proteins that matched the query sequences under a certain threshold and that belong to the same oldest lineage of the query species (e.g., "cellular organisms")
		if [[ -f $QUERY_FASTA ]]; then
			GENENUM=$(grep -c ">" ${QUERY_FASTA})
		elif [[ -d $QUERY_PDB ]]; then
			GENENUM=$(ls ${QUERY_PDB} | wc -l)
		fi

		OLDEST=$(echo $QPHYLOS | awk -F "," '{ print $NF }')

		HITPERC=$(( ${GENENUM}*${MINHITS}/100 ))
		awk '{ print $NF }' ${DIAMONDOUT} | sort -n -T ${TMP_PATH} | uniq -c | awk -v HITPERC="$HITPERC" '{ if ($1 >= HITPERC) print $2 }' | sed '/^[[:space:]]*$/d' | awk -F "," 'FNR==NR{ a[$1]=$0;next } ($1 in a)' - ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages | grep "$OLDEST" > ${TMP_PATH}/tmp_${NCBITAX}_matches

		#Evaluate the existance of unassignable clades in the database
		MISSINGSTRATA=()
		awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $10","$11","$12","$13","$14","$15","$16","$17","$18","$19","$20","$21","$22","$23","$24","$25","$26","$27","$28","$29","$41","$42","$43","$44 }' ${RAWTAX} > ${TMP_PATH}/tmp_missingtaxa_${NCBITAX}

		IFS=$'\n'
		for MISSING in $(sed "s/,/\n/g" ${TMP_PATH}/tmp_missingtaxa_${NCBITAX}); do

			if [ -n "$MISSING" ]; then
				MISSINGSTRATA+="[${MISSING}] "
			fi
		done

		if [ ${#MISSINGSTRATA[@]} -gt 0 ]; then

			echo "--------------------------------------------------"
			echo "  WARNING: genEra was unable to automatically include the following phylostrata in the analysis, as the NCBI database did not assigned them to a concrete taxonomic level:"
			echo "  $MISSINGSTRATA"
			echo "  If you want to include them in your analysis, make sure that genEra runs in a machine with internet access"
			echo "  Alternatively, the user can generate a custom table (-c) directly from the raw \"ncbi_lineages\" file, using the following command:"
			echo "  awk -F'\"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(\",\", \"\", \$i) } 1' ncbi_lineages_[date].csv | awk -F \",\" '{ print \$1\",\"\$8\",\"\$7\",\"\$6 ... }' > custom_table.csv"

		fi

		# Finally, tailor the taxonomic database for the query species with the phylostrata that are useful for the analysis
		COUNTER=2

		echo "--------------------------------------------------"
		echo "Collapsing the phylostrata that are not represented in your DIAMOND results"
		cat ${TMP_PATH}/tmp_${NCBITAX}_matches > ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches
		ARRAY=()
		COLLAPSEDSTRATA=()

		IFS=$'\n'
		for STRATUM in $(echo $QPHYLOS | sed 's/,,/,NA,/g' | sed 's/,,/,NA,/g' | sed "s/,/\n/g" | tail -n +2); do

			if [ $STRATUM != "NA" ]; then

				 MATCHINGTAXA=$(awk -F "," -v COUNTER="$COUNTER" -v STRATUM="$STRATUM" '{ if ($COUNTER == STRATUM) print $0 }' ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches | wc -l)
				if [ "$MATCHINGTAXA" -ge 1 ]; then
					awk -F "," -v COUNTER="$COUNTER" -v STRATUM="$STRATUM" '{ if ($COUNTER != STRATUM) print $0 }' ${TMP_PATH}/tmp_${NCBITAX}_matches > ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches
					ARRAY+=$COUNTER,
				else
					COLLAPSEDSTRATA+="[${STRATUM}] "
				fi
			fi
			let COUNTER+=1

		done

		if [ ${#COLLAPSEDSTRATA[@]} -gt 0 ]; then

			echo "--------------------------------------------------"
			echo "  WARNING: The following phylostrata were collapsed due to lack of sufficient genomic data"
			echo "  $COLLAPSEDSTRATA"
			echo "  If you want to include them in your analysis, please add the necessary taxa in a custom database (-a or -f), making sure their last common ancestor to the query species can be assigned to that specific taxonomic level in the NCBI taxonomy database"

		fi

		echo "--------------------------------------------------"
		echo "Generating the species-tailored database"

		awk -F "," '{ print $1 }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages > ${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv
		PRINTCOLUMN="${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv"

		for LEVEL in $(echo $ARRAY | sed "s/,/\n/g" | sed '/^[[:space:]]*$/d'); do

			awk -F "," -v LEVEL="$LEVEL" '{ print $LEVEL }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages | paste -d , ${PRINTCOLUMN} - > ${TMP_PATH}/tmp_column_${LEVEL}
			PRINTCOLUMN="${TMP_PATH}/tmp_column_${LEVEL}"
		done

		cat ${PRINTCOLUMN} > ${OUTDIR}/${NCBITAX}_ncbi_lineages.csv

		rm ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages ${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches ${TMP_PATH}/tmp_${NCBITAX}_matches ${TMP_PATH}/tmp_column_* ${TMP_PATH}/tmp_missingtaxa_${NCBITAX}

		CUSTOMTAX="${OUTDIR}/${NCBITAX}_ncbi_lineages.csv"

	fi

	# Add the infraspecies-level nodes to the taxonomic database
	if [[ -f ${STRAINS} ]]; then

		echo "--------------------------------------------------"
		echo "Adding the infraspecies nodes to the species-tailored database"

		# Extract the taxonomic levels of the query species from the NCBI Taxonomy, minus the NCBI Taxonomy ID
		QPHYLOS_SUFFIX=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${CUSTOMTAX} | awk -F "," 'BEGIN{OFS=","} {$1="";print}')

		# Calculate the number of columns in the strains table to generate a string of N commas
		NUM_COLUMNS=$(awk -F',' 'NR==1{print NF}' ${TMP_PATH}/${TARGET_GENOME}.csv)
		COMMAS=$(printf ',%.0s' $(seq 1 $NUM_COLUMNS))

		# Generate a string with the original lineage line for que query species for removal
		QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${CUSTOMTAX})

		# Remove the line containing the lineage info for the query species and add N commas between column 1 and column 2 of the NCBI taxonomy table
		grep -v "$QPHYLOS" ${CUSTOMTAX} | sed "s/\([^,]*\),\([^,]*\)/\1$COMMAS\2/" > ${TMP_PATH}/tmp_${NCBITAX}_added_columns

		# Add the missing taxonomic levels to the infraspecies-level genomes and incorporate these to the NCBI taxonomy table
		awk -F "," -v QPHYLOS_SUFFIX="$QPHYLOS_SUFFIX" '{ print $0 QPHYLOS_SUFFIX }' ${TMP_PATH}/${TARGET_GENOME}.csv >> ${TMP_PATH}/tmp_${NCBITAX}_added_columns

		# Replace the species-tailored taxonomy table with the additional infraspecies-level nodes
		mv ${TMP_PATH}/tmp_${NCBITAX}_added_columns ${CUSTOMTAX}

		# Remove the "${TARGET_GENOME}.csv" file from tmp
		rm ${TMP_PATH}/${TARGET_GENOME}.csv

	fi

	if [ -s ${CUSTOMTAX} ]; then

		echo "--------------------------------------------------"
		echo "Step 2 finished!"
		echo "your species-tailored database can be found in ${CUSTOMTAX}"
		echo "Keep this file in case you want to re-run step 3 of genEra with the same species (-c)"

	else
		echo
		echo "  ERROR: The species-tailored database is empty! please send me an email to figure out what might be the issue (josue.barrera@tuebingen.mpg.de)"
		echo "  Exiting"
		echo
		exit 1
	fi

fi

if [[ ! -f ${CUSTOMTAX} ]]; then
	echo
	echo "  ERROR: genEra could not find the species-tailored \"ncbi_lineages\" file:"
	echo "  ${CUSTOMTAX}"
	echo "  Please make sure that this file exists, otherwise run step 2 of genEra (-d or -r)"
	echo "  If step 1 ran succesfully, the user can resume from this step using argument -p"
	echo "  Exiting"
	exit 1
fi

#######################################################
###                 GenEra: STEP 3                  ###
#######################################################

# Start the gene age assignment
echo
echo "STARTING STEP 3: ASSIGNING AGES TO YOUR QUERY GENES WITH Erassignment"

# Generate temporal list with all query proteins from step 1
if [[ -f $QUERY_FASTA ]]; then
	grep ">" ${QUERY_FASTA} | awk '{ print $1 }' | sed 's/>//g' > ${TMP_PATH}/tmp_gene_list
elif [[ -d $QUERY_PDB ]]; then
	ls ${QUERY_PDB} > ${TMP_PATH}/tmp_gene_list
fi

if [[ -f ${DIVERGENCE} ]]; then

	# Calculate the size of the genes in the query FASTA or PDB files
	if [[ -f $QUERY_FASTA ]]; then
		echo -e "GeneID\tGeneLength" > ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths
		awk '/^>/{if (l!="") print l; print; l=0; next}{l+=length($0)}END{print l}' ${QUERY_FASTA} | awk '{ print $1 }' | sed '$!N;s/\n/\t/g' | sed 's/>//g' >> ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths
	elif [[ -d $QUERY_PDB ]]; then
		echo -e "GeneID\tGeneLength" > ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths
		for PDBFILE in $(ls ${QUERY_PDB}); do
			PROT_SIZE=$(awk '{ if ($1 == "TER") print $4$5 }' ${QUERY_PDB}/${PDBFILE} | sed 's/A//g')
			echo -e "${PDBFILE}\t${PROT_SIZE}" >> ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths
		done
	fi

	# Create header of the bitscore file for abSENSE
	awk  '{ printf( "%s ", $1 ); } END { printf( "\n" ); }' ${DIVERGENCE} | sed 's/[[:space:]]*$//' | sed 's/ /\t/g' | sed -e 's/^/None\t/' > ${OUTDIR}/${NCBITAX}_bitscores.tsv

fi


if [ $THREADS -gt 1 ]; then

	if [[ "${FAST_STEP3}" = "true" ]]; then
		
		# Split ${DIAMONDOUT} into N files with 848068583 lines due to R number of lines limit | This can be further tested for improving RAM usage

		split -l 848068583 ${DIAMONDOUT} -d "${TMP_PATH}/Diamond_F3R_"

		# Split ${DIAMONDOUT} results per gene and store in ${TMP_PATH}

		echo "--------------------------------------------------"
		echo "Splitting results per query gene using ${THREADS} threads"

		Rscript --vanilla ${F3R} -t ${TMP_PATH} -c ${THREADS}

		rm ${TMP_PATH}/Diamond_F3R_*

	fi

	# Split the gene list in accordance with the number of threads
	SPLIT_NUM=$(wc -l ${TMP_PATH}/tmp_gene_list | cut -d' ' -f1 | while read number; do  echo $((number / $THREADS +1)); done)
	split -l $SPLIT_NUM ${TMP_PATH}/tmp_gene_list ${TMP_PATH}/tmp_gene_sublist_

	# Create modified copies of Erassignment, according to the number of threads
	MULTITHREAD_COUNTER=1
	for SUBLIST in $(ls ${TMP_PATH} | grep "tmp_gene_sublist_"); do
		sed "s/tmp_gene_list/$SUBLIST/g" ${PHYLOSCRIPT} | sed "s/tmp_phylostrata/tmp_phylostrata_$MULTITHREAD_COUNTER/g" | sed "s/tmp_sub_phylostrata/tmp_sub_phylostrata_$MULTITHREAD_COUNTER/g" > ${TMP_PATH}/Erassignment_$MULTITHREAD_COUNTER && chmod +x ${TMP_PATH}/Erassignment_$MULTITHREAD_COUNTER
		let MULTITHREAD_COUNTER+=1
	done

	# Print the header of the output files
	echo -e "#gene\tphylostratum\trank\ttaxonomic_representativeness" > ${OUTDIR}/${NCBITAX}_gene_ages.tsv
	echo -e "#gene\tpossible_phylostrata" > ${OUTDIR}/${NCBITAX}_ambiguous_phylostrata.tsv
	if [[ "${PRINTOLDEST}" = "true" ]]; then
		echo -e "#qseqid\tsseqid\tevalue\tbitscore\tstaxids" > ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv
	fi

	# Run the multithreaded phylostratigraphic assignment

	echo "--------------------------------------------------"
	echo "Running Erassignment using ${THREADS} threads"

	for SUBSCRIPT in $(ls ${TMP_PATH}/Erassignment_*); do

		${SUBSCRIPT} ${DIAMONDOUT} ${CUSTOMTAX} ${NCBITAX} ${THRESHOLD} ${TMP_PATH} ${PRINTOLDEST} ${OUTDIR} ${FAST_STEP3} ${DIVERGENCE} &

	done
	wait
	rm ${TMP_PATH}/Erassignment_*
	sort ${OUTDIR}/${NCBITAX}_gene_ages.tsv > ${OUTDIR}/tmp_assignment && mv ${OUTDIR}/tmp_assignment ${OUTDIR}/${NCBITAX}_gene_ages.tsv
	if [[ "${PRINTOLDEST}" = "true" ]]; then
		sort ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv > ${OUTDIR}/tmp_deepest_homolog && mv ${OUTDIR}/tmp_deepest_homolog ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv
	fi

# Otherwise run as single-threaded job (not recommended)
else

	echo "--------------------------------------------------"
	echo "Running Erassignment using a single thread"
	echo "This WILL take a while"

	echo -e "#gene\tphylostratum\trank\ttaxonomic_representativeness" > ${OUTDIR}/${NCBITAX}_gene_ages.tsv
	echo -e "#gene\tpossible_phylostrata" > ${OUTDIR}/${NCBITAX}_ambiguous_phylostrata.tsv
	if [[ "${PRINTOLDEST}" = "true" ]]; then
		echo -e "#qseqid\tsseqid\tevalue\tbitscore\tstaxids" > ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv
	fi

	if [[ "${FAST_STEP3}" = "true" ]]; then

		# If FAST_STEP3 is setted as true with single thread turn to false
		FAST_STEP3='false'
		echo "-F (FAST_STEP3) implementation is only available for runs with multiple threads specified. Setting -F to false."

	fi

	Erassignment ${DIAMONDOUT} ${CUSTOMTAX} ${NCBITAX} ${THRESHOLD} ${TMP_PATH} ${PRINTOLDEST} ${OUTDIR} ${FAST_STEP3} ${DIVERGENCE}


fi

if [[ "${JACKHMMER}" = "true" && -f $QUERY_FASTA ]]; then

	echo "--------------------------------------------------"
	echo "The user requested an additional jackhmmer step on top of DIAMOND"
	echo "Re-analyzing all genes with a gene age rank of ${HMMERTEST} or higher using hmmEra"
	echo "Be aware that this extra step requires internet connection and is VERY SLOW"

	hmmEra -f ${QUERY_FASTA} -a ${OUTDIR}/${NCBITAX}_gene_ages.tsv -e ${EVALUE} -s ${HMMERTEST} -o ${TMP_PATH}/${NCBITAX}_HMMER_results.bout
	cat ${TMP_PATH}/${NCBITAX}_HMMER_results.bout >> ${DIAMONDOUT}

	echo "--------------------------------------------------"
	echo "Re-calculating gene ages after running hmmEra"

	if [ $THREADS -gt 1 ]; then
		SPLIT_NUM=$(wc -l ${TMP_PATH}/tmp_gene_list | cut -d' ' -f1 | while read number; do  echo $((number / $THREADS +1)); done)
		split -l $SPLIT_NUM ${TMP_PATH}/tmp_gene_list ${TMP_PATH}/tmp_gene_sublist_
		MULTITHREAD_COUNTER=1
		for SUBLIST in $(ls ${TMP_PATH} | grep "tmp_gene_sublist_"); do
			sed "s/tmp_gene_list/$SUBLIST/g" ${PHYLOSCRIPT} | sed "s/tmp_phylostrata/tmp_phylostrata_$MULTITHREAD_COUNTER/g" | sed "s/tmp_sub_phylostrata/tmp_sub_phylostrata_$MULTITHREAD_COUNTER/g" > ${TMP_PATH}/Erassignment_$MULTITHREAD_COUNTER && chmod +x ${TMP_PATH}/Erassignment_$MULTITHREAD_COUNTER
			let MULTITHREAD_COUNTER+=1
		done

		echo -e "#gene\tphylostratum\trank\ttaxonomic_representativeness" > ${OUTDIR}/${NCBITAX}_gene_ages.tsv
		echo -e "#gene\tpossible_phylostrata" > ${OUTDIR}/${NCBITAX}_ambiguous_phylostrata.tsv
		if [[ "${PRINTOLDEST}" = "true" ]]; then
			echo -e "#qseqid\tsseqid\tevalue\tbitscore\tstaxids" > ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv
		fi

		echo "Running Erassignment using ${THREADS} threads"

		for SUBSCRIPT in $(ls ${TMP_PATH}/Erassignment_*); do

			${SUBSCRIPT} ${DIAMONDOUT} ${CUSTOMTAX} ${NCBITAX} ${THRESHOLD} ${TMP_PATH} ${PRINTOLDEST} ${OUTDIR} ${DIVERGENCE} &

		done
		wait

		rm ${TMP_PATH}/Erassignment_*
		sort ${OUTDIR}/${NCBITAX}_gene_ages.tsv > ${OUTDIR}/tmp_assignment && mv ${OUTDIR}/tmp_assignment ${OUTDIR}/${NCBITAX}_gene_ages.tsv
		if [[ "${PRINTOLDEST}" = "true" ]]; then
			sort ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv > ${OUTDIR}/tmp_deepest_homolog && mv ${OUTDIR}/tmp_deepest_homolog ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv
		fi

	else

		echo "Running Erassignment using a single thread"
		echo "This WILL take a while"

		echo -e "#gene\tphylostratum\trank\ttaxonomic_representativeness" > ${OUTDIR}/${NCBITAX}_gene_ages.tsv
		echo -e "#gene\tpossible_phylostrata" > ${OUTDIR}/${NCBITAX}_ambiguous_phylostrata.tsv
		if [[ "${PRINTOLDEST}" = "true" ]]; then
			echo -e "#qseqid\tsseqid\tevalue\tbitscore\tstaxids" > ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv
		fi

		Erassignment ${DIAMONDOUT} ${CUSTOMTAX} ${NCBITAX} ${THRESHOLD} ${TMP_PATH} ${PRINTOLDEST} ${OUTDIR} ${DIVERGENCE}

	fi

elif [[ "${JACKHMMER}" = "true" && -d $QUERY_PDB ]]; then

	echo "--------------------------------------------------"
	echo "The user requested an additional jackhmmer step on top of Foldseek"
	echo "Unfortunately, jackhmmer does not accept PDB files"
	echo "Therefore, this step will be skipped"

fi

# Calculate the number of genes per phylostratum
echo -e "#number_of_genes\tphylostratum\tphylorank" > ${OUTDIR}/${NCBITAX}_gene_age_summary.tsv
QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${CUSTOMTAX})
PHYLORANK=$(echo ${QPHYLOS} | sed "s/,/\n/g" | tail -n +2 | wc -l)
IFS=$'\n'
for STRATUM in $(echo ${QPHYLOS} | sed "s/,/\n/g" | tail -n +2); do
	NUMGENES=$(awk -F "\t" -v STRATUM="$STRATUM" '{ if ($2 == STRATUM) print $0 }' ${OUTDIR}/${NCBITAX}_gene_ages.tsv | wc -l)
	echo -e "${NUMGENES}\t${STRATUM}\t${PHYLORANK}" >> ${OUTDIR}/${NCBITAX}_gene_age_summary.tsv
	let PHYLORANK-=1
done

rm ${TMP_PATH}/tmp_gene_*

if [[ -f ${TMP_PATH}/tmp_${NCBITAX}_sequences ]]; then
	rm ${TMP_PATH}/tmp_${NCBITAX}_sequences
fi

echo "--------------------------------------------------"
echo "Running mcl to define gene families"

# Run MCL to predict gene families from the query self-match in step 1
mcxload -abc ${TMP_PATH}/tmp_${NCBITAX}.abc --stream-mirror --stream-neg-log10 -stream-tf 'ceil(200)' -o ${TMP_PATH}/tmp_${NCBITAX}.mci -write-tab ${TMP_PATH}/tmp_${NCBITAX}.tab
mcl ${TMP_PATH}/tmp_${NCBITAX}.mci -I 1.5 -te ${THREADS} -o ${TMP_PATH}/tmp_${NCBITAX}.mcl -V all
mcxdump -icl ${TMP_PATH}/tmp_${NCBITAX}.mcl -tabr ${TMP_PATH}/tmp_${NCBITAX}.tab -o ${TMP_PATH}/tmp_${NCBITAX}.dump

# Eliminate all the temporary files
rm ${TMP_PATH}/tmp_${NCBITAX}.abc ${TMP_PATH}/tmp_${NCBITAX}.mci ${TMP_PATH}/tmp_${NCBITAX}.tab ${TMP_PATH}/tmp_${NCBITAX}.mcl

echo "--------------------------------------------------"
echo "Establishing the age and number of gene-family founder events"

# Generate a table with the oldest assignable phylostratum per gene family and gene-family size
echo -e "#gene_family\tphylostratum\trank\tfamily_size" > ${OUTDIR}/${NCBITAX}_founder_events.tsv

IFS=$'\n'
for GENECLUSTER in $(cat ${TMP_PATH}/tmp_${NCBITAX}.dump); do

	GENEFAMILY=$(echo ${GENECLUSTER} | sed 's/\t/,/g')
	FAMILYSIZE=$(echo ${GENECLUSTER} | sed 's/\t/\n/g' | wc -l)
	FOUNDER=$(echo ${GENECLUSTER} | sed 's/\t/\n/g' | awk 'FNR==NR{ a[$1]=$0;next } ($1 in a)' - ${OUTDIR}/${NCBITAX}_gene_ages.tsv | sort -T ${TMP_PATH} -t$'\t' -nk3 | grep -v "Possible contamination or HGT" | head -1 | cut -d'	' -f2,3)

	if [[ -z ${FOUNDER} ]]; then
		echo -e "$GENEFAMILY\tPossible contamination or HGT\tNA\t$FAMILYSIZE" >> ${OUTDIR}/${NCBITAX}_founder_events.tsv
	else
		echo -e "$GENEFAMILY\t$FOUNDER\t$FAMILYSIZE" >> ${OUTDIR}/${NCBITAX}_founder_events.tsv
	fi
done

rm ${TMP_PATH}/tmp_${NCBITAX}.dump

# Summarize the number of gene founder events per phylostratum
echo -e "#number_of_founders\tphylostratum\tphylorank" > ${OUTDIR}/${NCBITAX}_founder_summary.tsv
QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${CUSTOMTAX})
PHYLORANK=$(echo ${QPHYLOS} | sed "s/,/\n/g" | tail -n +2 | wc -l)
IFS=$'\n'
for STRATUM in $(echo ${QPHYLOS} | sed "s/,/\n/g" | tail -n +2); do
	NUMFOUNDERS=$(awk -F "\t" -v STRATUM="$STRATUM" '{ if ($2 == STRATUM) print $0 }' ${OUTDIR}/${NCBITAX}_founder_events.tsv | wc -l)
	echo -e "${NUMFOUNDERS}\t${STRATUM}\t${PHYLORANK}" >> ${OUTDIR}/${NCBITAX}_founder_summary.tsv
	let PHYLORANK-=1
done

echo "--------------------------------------------------"
echo "Step 3 finished!"
echo "The age assignment for your individual genes can be found in ${OUTDIR}/${NCBITAX}_gene_ages.tsv"
if [[ "${PRINTOLDEST}" = "true" ]]; then
	echo "The deepest traceable homologs for each of your genes can be found in ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv"
fi
echo "The possible ages for the genes with a taxonomic representativeness below ${THRESHOLD} percent can be found in ${OUTDIR}/${NCBITAX}_ambiguous_phylostrata.tsv"
echo "The estimation of gene family founder events can be found in ${OUTDIR}/${NCBITAX}_founder_events.tsv"
echo "The number of individual genes that could be assigned to each phylostratum are summarized in ${OUTDIR}/${NCBITAX}_gene_age_summary.tsv"
echo "The number of of gene family founder events per phylostratum are summarized in ${OUTDIR}/${NCBITAX}_founder_summary.tsv"

#######################################################
###                 GenEra: STEP 4                  ###
#######################################################

# Run abSENSE, if the user invokes it with -s
if [[ -f ${DIVERGENCE} ]]; then

	# Check that the abSENSE input file is not empty
	if [ -s ${DIVERGENCE} ]; then
		echo
		echo "STARTING STEP 4: CALCULATING DETECTION FAILURE PROBABILITIES WITH abSENSE.py"
	else
		echo
		echo "  ERROR: The file with the pairwise evolutionary distances is empty"
		echo "  Exiting"
		echo
		exit 1
	fi

	# Split the input files in accordance with the number of threads
	if [ "${THREADS}" -gt 1 ]; then
		head -1 ${OUTDIR}/${NCBITAX}_bitscores.tsv > ${TMP_PATH}/bitscores_header
		head -1 ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths > ${TMP_PATH}/gene_lengths_header
		tail -n +2 ${OUTDIR}/${NCBITAX}_bitscores.tsv | sort > ${TMP_PATH}/tmp_${NCBITAX}_bitscores
		tail -n +2 ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths | sort > ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_no_header
		split -l $SPLIT_NUM ${TMP_PATH}/tmp_${NCBITAX}_bitscores ${TMP_PATH}/tmp_${NCBITAX}_bitscores_
		split -l $SPLIT_NUM ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_no_header ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_no_header_

		# Rename the input files according to the number of threads
		MULTITHREAD_COUNTER=1
		for SUBLIST in $(ls ${TMP_PATH} | grep "tmp_${NCBITAX}_bitscores_"); do
			SUFIXTMP=$(echo ${SUBLIST} | cut -d "_" -f4)
			cat ${TMP_PATH}/bitscores_header > ${TMP_PATH}/${NCBITAX}_bitscores_${MULTITHREAD_COUNTER}.tsv && cat ${TMP_PATH}/tmp_${NCBITAX}_bitscores_${SUFIXTMP} >> ${TMP_PATH}/${NCBITAX}_bitscores_${MULTITHREAD_COUNTER}.tsv
			cat ${TMP_PATH}/gene_lengths_header > ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_${MULTITHREAD_COUNTER} && cat ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_no_header_${SUFIXTMP}  >> ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_${MULTITHREAD_COUNTER}
			let MULTITHREAD_COUNTER+=1
		done

		# Eliminate all temporary files generated earlier
		rm ${TMP_PATH}/bitscores_header ${TMP_PATH}/gene_lengths_header ${TMP_PATH}/tmp_${NCBITAX}_bitscores ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_no_header ${TMP_PATH}/tmp_${NCBITAX}_bitscores_* ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_no_header_*

		# Run abSENSE using multiple threads

		echo "--------------------------------------------------"
		echo "Running abSENSE using ${THREADS} threads"

		MULTITHREAD_COUNTER=1
		while [[ ${MULTITHREAD_COUNTER} -le ${THREADS} ]]; do
			Run_abSENSE.py --distfile ${DIVERGENCE} --scorefile ${TMP_PATH}/${NCBITAX}_bitscores_${MULTITHREAD_COUNTER}.tsv --Eval ${EVALUE} --genelenfile ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_${MULTITHREAD_COUNTER} --out ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results >> ${OUTDIR}/log_${NCBITAX}_abSENSE &
			let MULTITHREAD_COUNTER+=1
		done
		wait

		# Merge all the abSENSE subfiles in a single directory
		mkdir ${OUTDIR}/${NCBITAX}_abSENSE_results
		head -2 ${TMP_PATH}/1_abSENSE_results/Bitscore_99PI_lowerbound_predictions > ${OUTDIR}/${NCBITAX}_abSENSE_results/Bitscore_99PI_lowerbound_predictions
		head -2 ${TMP_PATH}/1_abSENSE_results/Bitscore_99PI_higherbound_predictions > ${OUTDIR}/${NCBITAX}_abSENSE_results/Bitscore_99PI_higherbound_predictions
		head -2 ${TMP_PATH}/1_abSENSE_results/Detection_failure_probabilities > ${OUTDIR}/${NCBITAX}_abSENSE_results/Detection_failure_probabilities
		head -2 ${TMP_PATH}/1_abSENSE_results/Parameter_values > ${OUTDIR}/${NCBITAX}_abSENSE_results/Parameter_values
		head -2 ${TMP_PATH}/1_abSENSE_results/Predicted_bitscores > ${OUTDIR}/${NCBITAX}_abSENSE_results/Predicted_bitscores

		MULTITHREAD_COUNTER=1
		while [[ ${MULTITHREAD_COUNTER} -le ${THREADS} ]]; do
			tail -n +3 ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results/Bitscore_99PI_lowerbound_predictions >> ${OUTDIR}/${NCBITAX}_abSENSE_results/Bitscore_99PI_lowerbound_predictions
			tail -n +3 ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results/Bitscore_99PI_higherbound_predictions >> ${OUTDIR}/${NCBITAX}_abSENSE_results/Bitscore_99PI_higherbound_predictions
			tail -n +3 ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results/Detection_failure_probabilities >> ${OUTDIR}/${NCBITAX}_abSENSE_results/Detection_failure_probabilities
			tail -n +3 ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results/Parameter_values >> ${OUTDIR}/${NCBITAX}_abSENSE_results/Parameter_values
			tail -n +3 ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results/Predicted_bitscores >> ${OUTDIR}/${NCBITAX}_abSENSE_results/Predicted_bitscores
			rm ${TMP_PATH}/${NCBITAX}_bitscores_${MULTITHREAD_COUNTER}.tsv ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_${MULTITHREAD_COUNTER} ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results/* && rmdir ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results
			let MULTITHREAD_COUNTER+=1
		done

		mv ${OUTDIR}/log_${NCBITAX}_abSENSE ${OUTDIR}/${NCBITAX}_abSENSE_results/

	# Otherwise run as single-threaded job (not recommended)
	else

		echo "--------------------------------------------------"
		echo "Running abSENSE using a single thread"
		echo "This WILL take another while"

		Run_abSENSE.py --distfile ${DIVERGENCE} --scorefile ${OUTDIR}/${NCBITAX}_bitscores.tsv --Eval ${EVALUE} --genelenfile ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths --out ${OUTDIR}/${NCBITAX}_abSENSE_results > ${OUTDIR}/log_${NCBITAX}_abSENSE
		mv ${OUTDIR}/log_${NCBITAX}_abSENSE ${OUTDIR}/${NCBITAX}_abSENSE_results/

	fi

	echo "--------------------------------------------------"
	echo "Creating a list of high-confidence gene founder events (homology detection failure probabilities < 0.05 in the closest outgroup)"

	mv ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths ${OUTDIR}/${NCBITAX}_abSENSE_results/${NCBITAX}_gene_lengths.tsv

	# Print the headers for the output file of the high-confidence set of genes based on homology detection failure
	echo -e "#gene\tphylostratum\trank\ttaxonomic_representativeness" > ${OUTDIR}/${NCBITAX}_HDF_gene_ages.tsv
	echo -e "#gene_family\tphylostratum\trank\tfamily_size" > ${OUTDIR}/${NCBITAX}_HDF_founder_events.tsv

	# Extract the list of phylostrata in the query species
	QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${CUSTOMTAX})
	PHYLORANK=$(echo ${QPHYLOS} | sed "s/,/\n/g" | tail -n +2 | wc -l)

	# Print headers for the summary files
	echo -e "#number_of_genes\tphylostratum\tphylorank\toutgroup" > ${OUTDIR}/${NCBITAX}_HDF_gene_age_summary.tsv
	echo ${QPHYLOS} | awk -F "," -v PHYLORANK="$PHYLORANK" '{ print "NA\t"$2"\t"PHYLORANK"\tNA" }' >> ${OUTDIR}/${NCBITAX}_HDF_gene_age_summary.tsv
	echo -e "#number_of_founders\tphylostratum\tphylorank\toutgroup" > ${OUTDIR}/${NCBITAX}_HDF_founder_summary.tsv
	echo ${QPHYLOS} | awk -F "," -v PHYLORANK="$PHYLORANK" '{ print "NA\t"$2"\t"PHYLORANK"\tNA" }' >> ${OUTDIR}/${NCBITAX}_HDF_founder_summary.tsv

	# Create a list with the taxonomy of all the taxa that were included in the list with evolutionary distances
	awk '{ print $1 }' ${DIVERGENCE} | awk -F "," 'FNR==NR{ a[$1]=$0;next } ($1 in a)' - ${CUSTOMTAX} | grep "$OLDEST" > ${TMP_PATH}/tmp_taxa_distances

	# Create a counter for the homology detection failure test
	COUNTER_HDF=3
	let PHYLORANK-=1

	IFS=$'\n'
	for STRATUM in $(echo ${QPHYLOS} | sed "s/,/\n/g" | tail -n +3); do

		# Extract the position and name of the outgroup phylostratum in the custom NCBI_lineages file
		COUNTER_OUTGROUP=$((${COUNTER_HDF} + 1))
		OUTGROUP_STRATUM=$(echo ${QPHYLOS} | awk -F "," -v COUNTER_OUTGROUP="$COUNTER_OUTGROUP" '{ print $COUNTER_OUTGROUP }')

		# Extract the list of outgroup species that can be used to test each of the phylostrata in the analysis
		awk -F "," -v COUNTER_HDF="$COUNTER_HDF" -v STRATUM="$STRATUM" -v COUNTER_OUTGROUP="$COUNTER_OUTGROUP" -v OUTGROUP_STRATUM="$OUTGROUP_STRATUM" '{ if ($COUNTER_HDF != STRATUM && $COUNTER_OUTGROUP == OUTGROUP_STRATUM) print $1 }' ${TMP_PATH}/tmp_taxa_distances > ${TMP_PATH}/tmp_outgroups

		# Obtain the outgroup for each phylostratum with the lowest evolutionary distance to the target species
		HDF_OUTGROUP=$(grep -w -f ${TMP_PATH}/tmp_outgroups ${DIVERGENCE} | sort -T ${TMP_PATH} -t$'\t' -nk2 | head -1 | cut -d "	" -f1)

		# Obtain the column in the bitscore file that corresponds to the outgroup of each phylostratum, so it can be used for the homology detection failure test
		OUTGROUP_COLUMN=$(head -1 ${OUTDIR}/${NCBITAX}_bitscores.tsv | sed -n $'1s/\t/\\\n/gp' | grep -w -nx "$HDF_OUTGROUP" | cut -d: -f1)

		# Print all the gene founder events that passed the homology detection failure test
		awk -F "\t" -v STRATUM="$STRATUM" '{ if ($2 == STRATUM) print $1 }' ${OUTDIR}/${NCBITAX}_gene_ages.tsv > ${TMP_PATH}/tmp_level_${COUNTER_HDF}_genes.txt
		grep -v "#" ${OUTDIR}/${NCBITAX}_abSENSE_results/Detection_failure_probabilities | awk -F "\t" -v OUTGROUP_COLUMN="$OUTGROUP_COLUMN" '{ if ($OUTGROUP_COLUMN <= 0.05) print $1 }' | grep -w -f ${TMP_PATH}/tmp_level_${COUNTER_HDF}_genes.txt > ${TMP_PATH}/tmp_HDF_${COUNTER_HDF} && grep -f ${TMP_PATH}/tmp_HDF_${COUNTER_HDF} ${OUTDIR}/${NCBITAX}_gene_ages.tsv >> ${OUTDIR}/${NCBITAX}_HDF_gene_ages.tsv
		awk -F "\t" -v STRATUM="$STRATUM" '{ if ($2 == STRATUM) print $0 }' ${OUTDIR}/${NCBITAX}_founder_events.tsv | grep -f ${TMP_PATH}/tmp_HDF_${COUNTER_HDF} >> ${OUTDIR}/${NCBITAX}_HDF_founder_events.tsv
		rm ${TMP_PATH}/tmp_level_${COUNTER_HDF}_genes.txt ${TMP_PATH}/tmp_HDF_${COUNTER_HDF} ${TMP_PATH}/tmp_outgroups

		# Create the summary files for the gene age assignment and the gene-family founder events
		if [[ -z ${HDF_OUTGROUP} ]]; then

			echo -e "NA\t${STRATUM}\t${PHYLORANK}\tNO OUTGROUP" >> ${OUTDIR}/${NCBITAX}_HDF_founder_summary.tsv
			echo -e "NA\t${STRATUM}\t${PHYLORANK}\tNO OUTGROUP" >> ${OUTDIR}/${NCBITAX}_HDF_gene_age_summary.tsv

		else

			NUMGENES=$(awk -F "\t" -v STRATUM="$STRATUM" '{ if ($2 == STRATUM) print $0 }' ${OUTDIR}/${NCBITAX}_HDF_gene_ages.tsv | wc -l)
			NUMFOUNDERS=$(awk -F "\t" -v STRATUM="$STRATUM" '{ if ($2 == STRATUM) print $0 }' ${OUTDIR}/${NCBITAX}_HDF_founder_events.tsv | wc -l)
			echo -e "${NUMFOUNDERS}\t${STRATUM}\t${PHYLORANK}\t${HDF_OUTGROUP}" >> ${OUTDIR}/${NCBITAX}_HDF_founder_summary.tsv
			echo -e "${NUMGENES}\t${STRATUM}\t${PHYLORANK}\t${HDF_OUTGROUP}" >> ${OUTDIR}/${NCBITAX}_HDF_gene_age_summary.tsv

		fi

		# Modify the counters and continue with the loop
		let COUNTER_HDF+=1
		let PHYLORANK-=1

	done

	rm ${TMP_PATH}/tmp_taxa_distances
	mv ${OUTDIR}/${NCBITAX}_bitscores.tsv ${OUTDIR}/${NCBITAX}_abSENSE_results/

	echo "--------------------------------------------------"
	echo "Step 4 finished!"
	echo "The detection failure probabilities for each for your genes can be found in ${OUTDIR}/${NCBITAX}_abSENSE_results/Detection_failure_probabilities"
	echo "The list of gene age assignments that cannot be explained by homology detection failure can be found in ${OUTDIR}/${NCBITAX}_HDF_gene_ages.tsv"
	echo "The list of gene-family founder events that cannot be explained by homology detection failure can be found in ${OUTDIR}/${NCBITAX}_HDF_founder_events.tsv"
	echo "The summary files for this analysis are found in ${OUTDIR}/${NCBITAX}_HDF_gene_age_summary.tsv and ${OUTDIR}/${NCBITAX}_HDF_founder_summary.tsv"
	echo "Please refer to the following paper for an in-depth interpretation of homology detection failure probabilities:"
	echo "Weisman, C.M., Murray, A.W., & Eddy, S.R. (2020). Many, but not all, lineage-specific genes can be explained by homology detection failure. PLoS biology, 18(11), e3000862."

fi

echo
echo "genEra finished at:"
date
echo
echo "Enjoy your results!!!"