#!/usr/bin/env bds
# This is P.E.M.A! A metabarcoding pipeline for environmental DNA samples.
# Author: Haris Zafeiropoulos
# date: 2017-2018
# This script is meant for the Singularity image version of P.E.M.A.

#############################################################################################
#########      FUNCTION IN ORDER TO READ THE parameters.tsv FILE EACH TIME      ############
#############################################################################################

# First of all, read the parameters file and keep all of its entries in a map (a map is a dictionary in BDS).
# In order to be able to run from checkpoints, we made a function that re-reads the parameters file every time it is needed.
string{} readParameterFile(string parameter_file) {

	string parameters_file = parameter_file
	lines := parameters_file.readLines()
	string{} my_case

	for ( string line : lines ) {
		# we need a map with the parameters as keys and their values as values; hence we do not care about lines starting with "#"
		if ( line.startsWith("#") ) {
			continue
		} else {
			string pname, pvalue
			(pname, pvalue) = line.split('\t')	# tab - split
			my_case{ pname } = pvalue		# keep the 1st element as key (parameter) and the 2nd as its value (parameter's value)
			pname = ''
			pvalue = ''
		}
	}

	# return the map
	return( my_case )
}
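
# A minimal, hypothetical excerpt of the parameters.tsv layout this function expects
# (tab-separated key/value pairs; the parameter names shown below are ones actually used
# further down in this script, but the complete set depends on the PEMA version at hand):
#
#	# comment lines are skipped
#	outputFolderName	my_analysis
#	gene	gene_16S
#	maxInfo	Yes
#
# so that, e.g., readParameterFile("parameters.tsv"){'gene'} returns 'gene_16S'.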

###############################################################################################
################      set PEMA's environment to be able to be executed      ##################
###############################################################################################

# read the parameters file for the first time!
string parameterFilePath = "/mnt/analysis/"
parameterFilePath.chdir()

# set a variable with the parameters file as its value
string f = "parameters.tsv"
string{} paramsFirstAssignment = readParameterFile(f)

# keep as a variable the path where PEMA is found
string path = "/home"

# keep as a variable the path where the data are found
string dataPath = "/mnt/analysis/mydata"

# keep as a variable the path of a BLAST database
#string blastDatabasePath = "/mnt/databases"

# PEMA will leave its output also in /mnt
string outputPoint = "/mnt/analysis"

# Quite IMPORTANT step!
# If there are TrimLog files in the raw-data folder, PEMA does not run as it should; hence we remove them if there are any!
# This happens when PEMA is re-run over the same dataset.
dataPath.chdir() ;

# check if there is a TrimLog file and IF YES, then delete it
sys if [ $(find "$dataPath" -name "TrimLog*") ] ; then rm TrimLog* ; fi

# in case PEMA runs on macOS, some ".DS_Store" files may appear; these need to be removed in every step of the way!
sys if [ $(find "$dataPath" -name "*.DS_Store*") ] ; then rm .*[DS_]* ; fi

# we create a folder where the pipeline's output will be saved
outputPoint.chdir()

# check whether an output folder with the same name as the one given in the "parameters" file has already been created by previous runs
if ( paramsFirstAssignment{'outputFolderName'}.isDir() ) {
	print "this output folder already exists\n"
} else {
	# if not, create a folder with that name
	paramsFirstAssignment{'outputFolderName'}.mkdir()
	print "a new output folder was just created!\n"
}

# keep the output folder name as a variable
string analysisName = paramsFirstAssignment{'outputFolderName'}

# here are all the directories, as variables, that we will use
string qualityControl = "1.quality_control" ;
string trimmomatic = "2.trimmomatic_output" ;
string bayesHammer = "3.correct_by_BayesHammer"
string spades = "4.merged_by_PANDAseq" ;
string dereplicate = "5.dereplicate_by_obiuniq" ;
string linearized = "6.linearized_files" ;
string geneDependent = "7.gene_dependent"
string outputPerSample = "8.output_per_sample"

## and all the paths for each of them
string outputFilePath = outputPoint + "/" + paramsFirstAssignment{'outputFolderName'}
string fastqcPath = outputFilePath + "/" + qualityControl ;
string trimoPath = outputFilePath + "/" + trimmomatic ;
string bayesPath = outputFilePath + "/" + bayesHammer ;
string spaPath = outputFilePath + "/" + spades ;
string derePath = outputFilePath + "/" + dereplicate ;
string linearPath = outputFilePath + "/" + linearized ;
string genePath = outputFilePath + "/" + geneDependent ;
string outputPerSamplePath = outputFilePath + "/" + outputPerSample

# and create an output directory for each pipeline step - if PEMA runs for the first time on a specific experiment
outputFilePath.chdir()

# make a list with all the directory names that can be found in the output directory
string[] outputFiles = outputFilePath.dir()

# if that list is empty, then create the directories mentioned above
if ( outputFiles.isEmpty() == true ) {
	# create all the output directories
	qualityControl.mkdir() ; trimmomatic.mkdir() ; bayesHammer.mkdir() ; spades.mkdir()
	dereplicate.mkdir() ; linearized.mkdir() ; geneDependent.mkdir() ; outputPerSample.mkdir()
}

# gene dependent directory
genePath.chdir()
paramsFirstAssignment{'gene'}.mkdir()

wait

##########################################################################################
#                                                                                        #
#            From this point, PEMA and its tools are about to start running              #
#                                                                                        #
##########################################################################################

##########################################################################################
##########################            1. FastQC             #############################
##########################################################################################

# If there are TrimLog files in the raw-data folder, PEMA does not run as it should; hence we remove them if there are any!
# We re-run this command because of FastQC and the way it reacts to multiple runs over the same folders
dataPath.chdir() ;
sys if [ $(find "$dataPath" -name "TrimLog*") ] ; then rm TrimLog* ; fi

# make a copy of the "parameters.tsv" file in the output folder;
# this way the user will be able to get the parameters used for the analysis, anytime!
sys cp $parameterFilePath/parameters.tsv $outputFilePath/parameters0f.$analysisName.tsv

# now FastQC runs for all files
string[] rawFiles = dataPath.dir()
for ( string rawFile : rawFiles ) {
	print(rawFile)
	task $path/tools/fastqc/FastQC/fastqc --outdir $fastqcPath $dataPath/$rawFile
}
wait
print("FastQC is completed!" + "\n")

# now we create a checkpoint and read the parameters file again;
# this way, if any of the parameters is changed, only the steps from here on are executed, with the new values.
if ( qualityControl.isEmpty() == false ) {
	checkpoint "trimming.chp"
}
parameterFilePath.chdir()
string g = "parameters.tsv"
string{} paramsTrimmoStep = readParameterFile(g)

###########################################################################################
###########################          2. Trimmomatic          #############################
###########################################################################################

dataPath.chdir() ;

#################     step only for the case of the ITS marker gene     ##################
if ( paramsTrimmoStep{'gene'} == 'gene_ITS' ) {

	dataPath.chdir()
	string cutadaptOutputDirectory = "cutadapt"
	string initialData = "initialData"

	if ( cutadaptOutputDirectory.isDir() && initialData.isDir() ) {
		print " done that! \n"
	} else {
		# if not, create the directories needed
		cutadaptOutputDirectory.mkdir()
		initialData.mkdir()
		print "we made an output directory for cutadapt \n"
	}
	wait

	task /usr/bin/R-3.6.0/bin/Rscript $path/scripts/cutadaptITS.R $paramsTrimmoStep{'forwardITSPrimer'} $paramsTrimmoStep{'reverseITSPrimer'} $dataPath
	wait

	sys mv *.fastq.gz initialData
	sys mv $initialData $outputPoint/$analysisName
	sys mv cutadapt/* .
	sys rm -r cutadapt
	wait
}
###########################  here ends the ITS - specific part !!  #######################

# go to the folder which contains all the raw data and make a list (named 'data') with all the file names found there
string[] data = dataPath.dir()

# set two variables, readF and readR, for the forward and the reverse file of each sample
string readF
string readR
int counter = 0

for ( string file : data ) {

	# here's the trick! we need to tell which file is the forward and which the reverse one; hence, we try to split off a suffix.
	string check = file.split("_2\.fastq\.gz")[0]

	# if the split does occur, then we know we have the second file of a sample (the reverse one);
	# that is because the 'file' variable will always have a '.fastq.gz' suffix.
	# hence, if there is no split, the 'file' and 'check' variables are the same.
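	# For example (hypothetical file names): for "mysample_1.fastq.gz" no split occurs,
	# so check == file and the file is taken as the forward read; for "mysample_2.fastq.gz"
	# the split yields check == "mysample" != file, so the file is taken as the reverse read.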
	if ( file == check ) {
		readF = file
		print("readF is: " + readF + "\n")
	# if there is a split, then they are different
	} else {
		readR = file
		print("readR is: " + readR + "\n")
	}
	wait

	if ( readF.isEmpty() == false && readR.isEmpty() == false ) {

		if ( paramsTrimmoStep{'maxInfo'} == 'Yes' ) {
			print("You have selected Max INFO" + "\n")
			wait
			task /usr/lib/jvm/java-8-openjdk-amd64/bin/java -jar $path/tools/Trimmomatic/Trimmomatic-0.38/trimmomatic-0.38.jar PE -threads $paramsTrimmoStep{'threadsTrimmomatic'} -trimlog TrimLog $dataPath/$readF $dataPath/$readR $trimoPath/filtered_max_$readF.1P.fastq.gz $trimoPath/filtered_max_$readF.1U.fastq.gz $trimoPath/filtered_max_$readR.2P.fastq.gz $trimoPath/filtered_max_$readR.2U.fastq.gz ILLUMINACLIP:/$path/tools/Trimmomatic/Trimmomatic-0.38/adapters/$paramsTrimmoStep{'adapters'}:$paramsTrimmoStep{'seedMismatches'}:$paramsTrimmoStep{'palindromeClipThreshold'}:$paramsTrimmoStep{'simpleClipThreshold'} LEADING:$paramsTrimmoStep{'leading'} TRAILING:$paramsTrimmoStep{'trailing'} MAXINFO:$paramsTrimmoStep{'targetLength'}:$paramsTrimmoStep{'strictness'} MINLEN:$paramsTrimmoStep{'minlen'}
			wait
			readF = ''
			readR = ''
		} else if ( paramsTrimmoStep{'maxInfo'} == 'No' ) {
			print("You have selected no Max INFO" + "\n")
			task /usr/lib/jvm/java-8-openjdk-amd64/bin/java -jar $path/tools/Trimmomatic/Trimmomatic-0.38/trimmomatic-0.38.jar PE -threads $paramsTrimmoStep{'threadsTrimmomatic'} -trimlog TrimLog2 $dataPath/$readF $dataPath/$readR $trimoPath/filtered_$readF.1P.fastq.gz $trimoPath/filtered_$readF.1U.fastq.gz $trimoPath/filtered_$readR.2P.fastq.gz $trimoPath/filtered_$readR.2U.fastq.gz ILLUMINACLIP:/$path/tools/Trimmomatic/Trimmomatic-0.38/adapters/$paramsTrimmoStep{'adapters'}:$paramsTrimmoStep{'seedMismatches'}:$paramsTrimmoStep{'palindromeClipThreshold'}:$paramsTrimmoStep{'simpleClipThreshold'} LEADING:$paramsTrimmoStep{'leading'} TRAILING:$paramsTrimmoStep{'trailing'} MINLEN:$paramsTrimmoStep{'minlen'}
			wait
			readF = ''
			readR = ''
		}
	}
}
wait
print "Trimmomatic is done\n\n" ;

# let us make a CHECKPOINT from which we will be able to restart our analysis from the error-correction step
path.chdir() ;
trimoPath.chdir() ;
if ( trimoPath.isEmpty() == false ) {
	checkpoint "errorCorrection.chp"
}
parameterFilePath.chdir()
string h = "parameters.tsv"
string{} paramsBayesHammerStep = readParameterFile(h)

################################################################################################
##################      3. BayesHammer - error correction // SPAdes      ######################
################################################################################################
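
# A note on the file names produced above (hypothetical sample "mysample"; the "_max" part is
# present only when maxInfo was selected): each sample yields four Trimmomatic output files,
#	filtered[_max]_mysample_1.fastq.gz.1P.fastq.gz	paired forward reads	(kept)
#	filtered[_max]_mysample_1.fastq.gz.1U.fastq.gz	unpaired forward reads	(deleted below)
#	filtered[_max]_mysample_2.fastq.gz.2P.fastq.gz	paired reverse reads	(kept)
#	filtered[_max]_mysample_2.fastq.gz.2U.fastq.gz	unpaired reverse reads	(deleted below)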

# make a list with all the file names found in the folder with Trimmomatic's output -- e.g. suffix: '2.fastq.gz.2U.fastq.gz'
trimoPath.chdir()
sys if [ $(find "$trimoPath" -name "*.DS_Store*") ] ; then rm .*[DS_]* ; fi
string[] trimos = trimoPath.dir()

for ( string file : trimos ) {
	# we delete all the 'U.fastq.gz' files using the list we just made
	if ( file.endsWith("U.fastq.gz") ) {
		file.delete()
	}
}

# and then we make a new list with the files that remained
string[] finalTrimos = trimoPath.dir()
string readBayesF
string readBayesR
string newDir
string newFile
wait

for ( string trimmed : finalTrimos ) {

	# as in the Trimmomatic step, we need to know which is the forward and which the reverse file
	string trimCheck = trimmed.split("_2\.fastq\.gz\.2P\.fastq\.gz")[0]

	if ( trimmed == trimCheck ) {
		readBayesF = trimmed
	} else {
		readBayesR = trimmed
	}

	if ( readBayesF.isEmpty() == false && readBayesR.isEmpty() == false ) {

		if ( paramsBayesHammerStep{'maxInfo'} == 'Yes' ) {
			string newName = trimCheck.split("filtered\_max\_")
			newDir = newName.substr(3,13) ;
			print("trimCheck: " + trimCheck + "\n" + "newDir: " + newDir + "\n")
		} else if ( paramsBayesHammerStep{'maxInfo'} == 'No' ) {
			string newName = trimCheck.split("filtered_")
			newDir = newName.substr(3,13) ;
			print("trimCheck: " + trimCheck + "\n" + "newDir: " + newDir + "\n")
		}

		# working where the output of BayesHammer is located!
		bayesPath.chdir() ;
		newDir.mkdir() ;

		# going back to Trimmomatic's output
		trimoPath.chdir()
		wait

		# and we execute the BayesHammer algorithm that is included in SPAdes
		sys which gzip
		sys gzip --version
		print("readBayesF: " + readBayesF + "\n" + "and readBayesR: " + readBayesR + "\n")
		task $path/tools/SPAdes/SPAdes-3.13.0-Linux/bin/spades.py --only-error-correction -o $bayesPath/$newDir -1 $readBayesF -2 $readBayesR --disable-gzip-output
		wait

		# clear all the variables used in the BayesHammer step, so that they can be re-used
		readBayesF = ''
		readBayesR = ''
		newFile = ''
		newDir = ''
	}
}
wait
print("Error correction using BayesHammer is completed!" + "\n")

# let us make a checkpoint to start our pipeline from the merging step!
if ( bayesPath.isEmpty() == false ) {
	checkpoint "mergingWithSpades.chp"
}
parameterFilePath.chdir()
string i = "parameters.tsv"
string{} paramsSpadesMerging = readParameterFile(i)

#############################################################################################
############################   4. PANDAseq - merging/overlapping   #########################
#############################################################################################
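
# Input naming assumed in this step (derived from the SPAdes invocation above): for each sample
# there is a folder named after it (e.g. SRR1643855) holding a 'corrected' subfolder with the
# error-corrected reads, roughly:
#	filtered[_max]_SRR1643855_1.fastq.gz.1P.fastq.00.0_0.cor.fastq
#	filtered[_max]_SRR1643855_2.fastq.gz.2P.fastq.00.0_0.cor.fastq
# these are gzipped below and then handed to PANDAseq as its -f / -r inputs.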

bayesPath.chdir()
sys if [ $(find "$bayesPath" -name "*.DS_Store*") ] ; then rm .*[DS_]* ; fi

# keep in a list all the files that came from the BayesHammer algorithm, leaving aside some stats files;
# i.e. make a list with all the folders that exist in the BayesHammer output folder - only sample names (e.g. SRR1643855)
string[] correct = bayesPath.dir()
correct.remove("input_dataset.yaml") ; correct.remove("params.txt") ; correct.remove("spades.log") ; correct.remove("tmp")

# we need to "create" the names of the forward and the reverse file each time
string baseName
if ( paramsSpadesMerging{'maxInfo'} == 'Yes' ) {
	# we keep as 'baseName' the prefix of every file, which depends on whether maxInfo was used or not
	baseName = "filtered_max_"
} else {
	baseName = "filtered_"
}

for ( string correctFile : correct ) {

	string uniqPath = bayesPath + '/' + correctFile + '/' + 'corrected' ;

	# each time, we enter the directory of a specific sample in the output folder of BayesHammer (previous step)
	uniqPath.chdir() ;
	sys /bin/gzip *.fastq

	# in the 'corrected' folder of each sample there are two files - one forward, one reverse -
	# and we build two variables out of them
	string forward = "_1.fastq.gz.1P.fastq.00.0_0.cor.fastq.gz"
	string reverse = "_2.fastq.gz.2P.fastq.00.0_0.cor.fastq.gz"
	string forwardFile = baseName + correctFile + forward ;	print(forwardFile + "\n")
	string reverseFile = baseName + correctFile + reverse ;	print(reverseFile + "\n")
	wait

	# if the sample names are not in Illumina terms and PANDAseq has to be used, the sample names in the raw data have to be changed, e.g. to "ERR0001".
	if ( paramsSpadesMerging{'EnaData'} == 'No' ) {
		uniqPath.chdir()
		sys bash $path/scripts/prepareForPandaseq.sh $forwardFile $reverseFile
	}

	# finally, we execute the PANDAseq algorithm for the merging
	task $path/tools/PANDAseq/bin/pandaseq -f $forwardFile -r $reverseFile -6 -A $paramsSpadesMerging{'pandaseqAlgorithm'} -a -B -T $paramsSpadesMerging{'pandaseqThreads'} -o $paramsSpadesMerging{'minoverlap'} -t $paramsSpadesMerging{'threshold'} $paramsSpadesMerging{'pandaseqMinlen'} $paramsSpadesMerging{'elimination'} > $correctFile.merged.fastq.gz
	wait

	if ( paramsSpadesMerging{'EnaData'} == 'No' ) {
		string mergedFile = correctFile + ".merged.fastq.gz"
		sys $path/scripts/recoverSampleNames.sh $mergedFile
	}

	sys mv *.merged.fastq.gz $spaPath
	bayesPath.chdir()
}
wait

# now that pandaseq has merged all our samples,
# we unzip all the merged files into a new directory named "unzip"

# we go to the folder with the merging output and make a list with its elements
spaPath.chdir() ;
string[] merged = spaPath.dir() ;
string unzip = "unzip" ;

# we make a new directory named 'unzip'
unzip.mkdir() ;
string unzipPath = "$spaPath/unzip" ;

# and we remove the element 'unzip' from the list we made
merged.remove("unzip")

for ( string anyFile : merged ) {
	sys cp $anyFile $unzipPath
	unzipPath.chdir() ;
	string newFile = anyFile.baseName(".gz")
	sys mv $anyFile $newFile
	spaPath.chdir()
}
wait
print("Merging step by PANDAseq is completed" + "\n")

# we make a checkpoint to start our pipeline from the dereplication step!
if ( spaPath.isEmpty() == false ) {
	checkpoint "dereplicationStep.chp"
}
parameterFilePath.chdir()
string j = "parameters.tsv"
string{} paramsDereplication = readParameterFile(j)

##########################################################################################################
#########################    5. DEREPLICATION with the aid of the OBITools !!    ########################
##########################################################################################################
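
# What obiuniq does here (a sketch; the exact header layout may differ between OBITools versions):
# identical reads within a sample are collapsed into a single fasta record whose header carries a
# count annotation, roughly like (hypothetical id):
#	>ERR0001:12 merged_sample={}; count=42
#	acgt...
# the sed/awk/tr pipeline further down then rewrites such a record into the linearized, uppercase form:
#	>ERR0001:12_42
#	ACGT...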

unzipPath.chdir() ;
sys if [ $(find "$unzipPath" -name "*.DS_Store*") ] ; then rm .*[DS_]* ; fi
string[] unzips = unzipPath.dir()
string fileForDerepl

for ( string nodereplicate : unzips ) {
	# now we execute the OBITools
	task $path/tools/OBITools-1.2.13/bin/obiuniq -m sample $nodereplicate > $derePath/dereplicate_$nodereplicate ;
	wait
}
wait
print("all the first steps are done! clustering is about to start!" + "\n")

checkpoint "tableStep.chp"
parameterFilePath.chdir()
string k = "parameters.tsv"
string{} paramsOfTable = readParameterFile(k)

##############################################################################################################
#####################################   MERGING ALL FILES INTO (1) ONE   ####################################
##############################################################################################################

derePath.chdir()
sys if [ $(find "$derePath" -name "*.DS_Store*") ] ; then rm .*[DS_]* ; fi

# we make a list with all the file names that exist in the folder with the dereplication-step output
string[] dereplicates = derePath.dir()

for ( string derepl : dereplicates ) {
	# and we apply some transformations to all these files
	task {
		# we substitute 'merged_sample...' with '_' and remove the ';' characters
		sys sed 's/ merged_sample={}; count=/_/g ; s/;//g' $derepl > se.$derepl
		# merge all the lines of a fasta entry into one and only one line!
		sys awk 'NR==1 {print ; next} {printf /^>/ ? "\n"$0"\n" : $1} END {printf "\n"}' se.$derepl > nospace.$derepl
		# we turn all lowercase characters into uppercase
		sys cat nospace.$derepl | tr '[a-z]' '[A-Z]' > $linearPath/linearized.$derepl
	}
}
wait
derePath.chdir()
sys rm se.* nospace.*

# we go to the folder with the linearized data and make a list with all its contents
linearPath.chdir()
string[] linears = linearPath.dir()

# we remove some characteristic symbols from these files
for ( string sample : linears ) {
	task {
		sys sed 's/\:\:\:/\:/g' $sample > $sample.f.fasta
		sys sed 's/\:0\:0\:0\://' $sample.f.fasta > $sample.g.fasta
	}
}
wait
linearPath.chdir()
sys rm *.f.fasta
wait

# this command forces applications to use the default language for their output, and forces sorting to be bytewise.
sys export LC_ALL=C

######################################################################################################################################
# here we read all the files made so far at once and run over them an awk command in which the record separator is set to ">"
# and the field separator to '_' or a new line.
# from the second record on, we build one array with the amplicons as keys and their abundances as values, and another one with the
# amplicons as keys and their sequences as values.
# finally, we produce a file in which every line holds the amplicon name, its abundance and its sequence.
######################################################################################################################################
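
# A schematic example (hypothetical ids) of what the pooling step below does: given the same
# amplicon id in two of the *.g.fasta files,
#	>amp1_10	and	>amp1_32
#	ACGT		ACGT
# it emits a single record whose abundance is the sum, sorted by decreasing abundance, with the
# sed re-inserting the newline before the sequence:
#	>amp1_42
#	ACGT
# the second awk then drops two consecutive lines whenever a line contains an 'N', i.e. it
# discards records with ambiguous bases, producing final_all_samples.fasta.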

task cat *.g.fasta | awk 'BEGIN {RS = ">" ; FS = "[_\n]"} {if (NR != 1) {abundances[$1] += $2 ; sequences[$1] = $3}} END {for (amplicon in sequences) {print ">" amplicon "_" abundances[amplicon] "_" sequences[amplicon]}}' | sort --temporary-directory=$(pwd) -t "_" -k2,2nr -k1.2,1d | sed -e 's/\_/\n/2' > $outputFilePath/all_samples.fasta
wait

task awk '/N/{n=2}; n {n--; next}; 1' < $outputFilePath/all_samples.fasta > $outputFilePath/final_all_samples.fasta
wait
sys rm *.g.fasta

# This was added in order to get a contingency table in the proper way.
if ( paramsOfTable{'clusteringAlgoFor16S_18SrRNA'} == "algo_swarm" || paramsOfTable{'clusteringAlgoForCOI_ITS'} == "algo_SWARM" ) {
	for ( string sample : linears ) {
		sys sed -i 's/:::/:/g; s/:0:0:0:0./0./g' $sample
	}
}

######################################################################################################################################
## in the file 'final_all_samples.fasta' we now have all the reads from all our samples,
## so we are able to move on to the CLUSTERING STEP;
## however, even though up to this point the procedure was the same no matter which gene we had,
## we now have to choose between different paths, depending on the gene and our needs
######################################################################################################################################

# depending on the marker gene and the clustering method selected, we make the right folders
genePath.chdir()

if ( paramsOfTable{'gene'} == 'gene_COI' ) {
	string coiPath = genePath + '/' + 'gene_COI'
	coiPath.chdir()
	if ( paramsOfTable{'clusteringAlgoForCOI_ITS'} == 'algo_CROP' ) {
		string algo = 'CROP'
		algo.mkdir()
	} else if ( paramsOfTable{'clusteringAlgoForCOI_ITS'} == 'algo_SWARM' ) {
		string algo = 'SWARM'
		algo.mkdir()
	}
} else if ( paramsOfTable{'gene'} == 'gene_16S' ) {
	print("my marker gene is 16S! \n")
	string sixteenPath = genePath + '/' + 'gene_16S'
	sixteenPath.chdir()
	if ( paramsOfTable{'clusteringAlgoFor16S_18SrRNA'} == 'algo_swarm' ) {
		string algo = 'swarm'
		algo.mkdir()
	} else if ( paramsOfTable{'clusteringAlgoFor16S_18SrRNA'} == 'algo_vsearch' ) {
		string algo = 'vsearch'
		algo.mkdir()
	}
} else if ( paramsOfTable{'gene'} == 'gene_18S' ) {
	print("my marker gene is 18S! \n")
	string eighteenPath = genePath + '/' + 'gene_18S'
	eighteenPath.chdir()
	if ( paramsOfTable{'clusteringAlgoFor16S_18SrRNA'} == 'algo_swarm' ) {
		string algo = 'swarm'
		algo.mkdir()
	} else if ( paramsOfTable{'clusteringAlgoFor16S_18SrRNA'} == 'algo_vsearch' ) {
		string algo = 'vsearch'
		algo.mkdir()
	}
} else if ( paramsOfTable{'gene'} == 'gene_ITS' ) {
	print("my marker gene is ITS! \n")
	string itsPath = genePath + '/' + 'gene_ITS'
	itsPath.chdir()
	if ( paramsOfTable{'clusteringAlgoForCOI_ITS'} == 'algo_CROP' ) {
		string algo = 'CROP'
		algo.mkdir()
	} else if ( paramsOfTable{'clusteringAlgoForCOI_ITS'} == 'algo_SWARM' ) {
		string algo = 'SWARM'
		algo.mkdir()
	}
}
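
# The layout created above under the output folder is, e.g. for a 16S/vsearch run:
#	<outputFolderName>/7.gene_dependent/gene_16S/vsearch/
# and accordingly gene_18S/{swarm,vsearch}, gene_COI/{CROP,SWARM} and gene_ITS/{CROP,SWARM}.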
\n") string itsPath = genePath + '/' + 'gene_ITS' itsPath.chdir() if ( paramsOfTable{'clusteringAlgoForCOI_ITS'} == 'algo_CROP' ) { string algo = 'CROP' algo.mkdir() } else if ( paramsOfTable{'clusteringAlgoForCOI_ITS'} == 'algo_SWARM' ) { string algo = 'SWARM' algo.mkdir() } } ################################################################################################################### # # # Clustering # # # ################################################################################################################### ################################################################################################################### #################################### 16S/18S rRNA marker gene case ######################################## ################################################################################################################### ################ at first, cluster or "cluster" the reads into OTUs or ASVs correspondingly ################# if ( paramsOfTable{'gene'} == 'gene_16S' || paramsOfTable{'gene'} == 'gene_18S' ) { #////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ####################### Here is "vsearch" for the 16S marker gene ##################################### #\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ if ( paramsOfTable{'clusteringAlgoFor16S_18SrRNA'} == 'algo_vsearch' ) { print " my marker gene is 16S or 18S and I am about to use vsearch to get OTUs \n " wait #### Prepare files outputFilePath.chdir() string vsearchPath16S = genePath + '/' + paramsOfTable{'gene'} + '/' + 'vsearch' task sed 's/_/;size=/g' final_all_samples.fasta > $vsearchPath16S/all_samples.fasta wait ###################### here i execute the VSEARCH algorithms ############################################# vsearchPath16S.chdir() # demultiplexing task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --threads $paramsOfTable{'vsearchThreads'} --derep_fulllength all_samples.fasta --minuniquesize 2 --sizein --sizeout --fasta_width 0 --uc all.derep.uc --output all.derep.fasta wait # execute the first clustering step task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --threads $paramsOfTable{'vsearchThreads'} --cluster_size all.derep.fasta --id $paramsOfTable{'vsearchId'} --strand both --sizein --sizeout --fasta_width 0 --uc all.preclustered.uc --centroids all.preclustered.fasta wait # chimera removal task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --threads $paramsOfTable{'vsearchThreads'} --uchime_denovo all.preclustered.fasta --sizein --sizeout --fasta_width 0 --nonchimeras all.denovo.nonchimeras.fasta wait # change an output file name sys mv all.denovo.nonchimeras.fasta all.ref.nonchimeras.fasta # here are two perl scripts that are located in the "scripts" folder task $path/scripts/map_vsearch.pl all.derep.fasta all.preclustered.uc all.ref.nonchimeras.fasta > all.nonchimeras.derep.fasta wait task $path/scripts/map_vsearch.pl all_samples.fasta all.derep.uc all.nonchimeras.derep.fasta > all.nonchimeras.fasta wait # execute the second clustering step task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --threads $paramsOfTable{'vsearchThreads'} --cluster_size all.nonchimeras.fasta --id $paramsOfTable{'vsearchId'} --strand both --sizein --fasta_width 0 --uc all.clustered.uc --relabel Otu --centroids all.otus.fasta --otutabout all.otutab.txt wait sys mv all.otus.fasta rna_samples.otus.fa sys mv 

	#//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	############################      Here is "swarm" for the 16S/18S marker genes      ##############################
	#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
	} else if ( paramsOfTable{'clusteringAlgoFor16S_18SrRNA'} == 'algo_swarm' ) {

		#### Prepare files
		outputFilePath.chdir()
		string swarmPath16S = genePath + '/' + paramsOfTable{'gene'} + '/' + 'swarm'
		task cp final_all_samples.fasta $swarmPath16S/rna_SWARM_all_samples.fasta
		wait

		##### SWARM ####
		###### we build a contingency table for the AMPLICONS - here createContigencyTable.sh is a bash script that you can find in the PEMA folder
		linearPath.chdir()
		task {
			sys bash $path/scripts/createContigencyTable.sh
			sys sed 's/linearized.dereplicate_//g; s/.merged.fastq//g' amplicon_contingency_table_temp.csv > amplicon_contingency_table.csv
			sys mv amplicon_contingency_table.csv $swarmPath16S
		}
		wait
		print 'the contingency table has been created successfully!' + "\n"

		##### Run SWARM and then the chimera removal
		swarmPath16S.chdir()

		## here is the clustering step with the SWARM algorithm
		if ( paramsOfTable{'d'} == '1' ) {
			task $path/tools/swarm/swarm/bin/swarm -d 1 -f -b $paramsOfTable{'boundary'} -t $paramsOfTable{'swarmThreads'} -s SWARM.stats -w SWARM_OTU_representatives_all_samples.fasta < rna_SWARM_all_samples.fasta > SWARM.swarms
		} else {
			task $path/tools/swarm/swarm/bin/swarm -d $paramsOfTable{'d'} -t $paramsOfTable{'swarmThreads'} -s SWARM.stats -w SWARM_OTU_representatives_all_samples.fasta < rna_SWARM_all_samples.fasta > SWARM.swarms
		}
		wait
		print('the swarm algorithm worked just fine!' + "\n")

		# and now we process its output to get it in the form we want: turn lowercase to uppercase and substitute the '_' with ';size='
		task {
			sys cat SWARM_OTU_representatives_all_samples.fasta | tr '[a-z]' '[A-Z]' > upper.SWARM_OTU_representatives_all_samples.fasta
			sys sed 's/_/;size=/g' upper.SWARM_OTU_representatives_all_samples.fasta > true.SWARM_OTU_representatives_all_samples.fasta
		}
		wait

		## 'sortbysize' is an algorithm for sorting by size (abundance)
		task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --sortbysize true.SWARM_OTU_representatives_all_samples.fasta --output SWARM_final_OTU_representative.fasta
		wait
		print('sortbysize algorithm ran ok!' + "\n")

		## and 'uchime3_denovo' is an algorithm for finding chimeras
		task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --uchime3_denovo SWARM_final_OTU_representative.fasta --nonchimeras SWARM_otu_no_chimera.fasta --abskew $paramsOfTable{'abskew'}
		wait
		print('the uchime3_denovo algorithm for finding chimeras just finished!' + "\n")

		string[] swarmInterFiles = swarmPath16S.dir()
		for ( string file : swarmInterFiles ) {
			if ( file.startsWith('upper') ) {
				file.delete()
			}
			if ( file.startsWith('true') ) {
				file.delete()
			}
		}

		##### at last, we create an OTU table that comes out of the SWARM results. Again, we have an awk command in a bash script
		swarmPath16S.chdir()
		# sys mv $linearPath/amplicon_contingency_table_temp.csv $swarmPath16S
		wait
		sys bash $path/scripts/otuContingencyTableSwarm.sh
		print('the contingency table with the found ASVs was just created!' + "\n")
	}
}
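
# A note on the abundance notation used back and forth above (hypothetical id): swarm reads and
# writes headers like '>amp1_42', whereas the vsearch tools (--sortbysize, --uchime3_denovo)
# expect '>amp1;size=42' - hence the sed 's/_/;size=/g' substitutions before calling them.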
+ "\n") } } ################################################################################################################### ######################## COI & ITS ( MOTU clustering - ASV inference ) ############################# ################################################################################################################### ### in the COI case, we will have an IF statement for the two different clustering algos and THEN, after, we will do the taxonomy assignment!!! if ( paramsOfTable{'gene'} == 'gene_COI' || paramsOfTable{'gene'} == "gene_ITS") { string gene = '' string geneName = '' if ( paramsOfTable{'gene'} == "gene_COI" ) { gene = "gene_COI" geneName = "COI" print 'my marker gene is COI' + "\n" } else if ( paramsOfTable{'gene'} == "gene_ITS") { gene = "gene_ITS" geneName = "ITS" print 'my marker gene is ITS' + "\n" } ## in this case, there is not an algo that does both chimera removal and clustering, hence we have a 2 - step procedure ## an kai stin arxi eixame pei prota chimera removal kai meta otu clustering, telika pame antistrofa #////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ########################### ASVs inference by making use of the SWARM algorithm ############################### #////////////////////////////////////////////////////////////////////////////////////////////////////////////////// if ( paramsOfTable{'clusteringAlgoForCOI_ITS'} == 'algo_SWARM' ) { #### Prepare files outputFilePath.chdir() string swarmPath = genePath + '/' + gene + '/' + 'SWARM' string allFiles = geneName + "_SWARM_all_samples.fasta" sys cp final_all_samples.fasta $swarmPath/$allFiles wait ###### we build a contingency table for AMPLICONS - here the awk.sh file is a bash script that you can find in the PEMA file linearPath.chdir() task{ sys bash $path/scripts/createContigencyTable.sh sys sed 's/linearized.dereplicate_//g; s/.merged.fastq//g' amplicon_contingency_table_temp.csv > amplicon_contingency_table.csv sys mv amplicon_contingency_table.csv $swarmPath } wait print 'the contigency table has been created successfully!' + "\n" ##### Run SWARM and then chimera removal swarmPath.chdir() string inputFileForSwarm = geneName + "_SWARM_all_samples.fasta" wait ## here is the clustering step with SWARM algorithm if ( paramsOfTable{'d'} == '1' ) { task $path/tools/swarm/swarm/bin/swarm -d 1 -f -t $paramsOfTable{'swarmThreads'} -s SWARM.stats -w SWARM_OTU_representatives_all_samples.fasta < $inputFileForSwarm > SWARM.swarms } else { task $path/tools/swarm/swarm/bin/swarm -d $paramsOfTable{'d'} -t $paramsOfTable{'swarmThreads'} -s SWARM.stats -w SWARM_OTU_representatives_all_samples.fasta < $inputFileForSwarm > SWARM.swarms } wait print('the swarm algorithm worked just fine!' + "\n") # and now i handle its output in order to take it in the form i want to. turn lowcase to uppercase and substitute the '_' to 'size=' task { sys cat SWARM_OTU_representatives_all_samples.fasta | tr '[a-z]' '[A-Z]' > upper.SWARM_OTU_representatives_all_samples.fasta sys sed 's/_/;size=/g' upper.SWARM_OTU_representatives_all_samples.fasta > true.SWARM_OTU_representatives_all_samples.fasta } wait ## 'sortbysize' is an algorithm for sorting by size (abundance) task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --sortbysize true.SWARM_OTU_representatives_all_samples.fasta --output SWARM_final_OTU_representative.fasta wait print('sortbysize algorithm ran ok!' 
+ "\n") ## and the 'uchime3_denovo' is an algorithm for finding chimeras task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --uchime3_denovo SWARM_final_OTU_representative.fasta --nonchimeras SWARM_otu_no_chimera.fasta --abskew $paramsOfTable{'abskew'} wait print('the uchime3_denovo algorithm for finding chimeras just finished!' + "\n") string[] swarmInterFIles = swarmPath.dir() for (string file : swarmInterFIles ) { if ( file.startsWith('upper') ) { file.delete() } if ( file.startsWith('true') ) { file.delete() } } ##### at last, we create an OTU table that comes after the SWARM results. Again, we have an awk command in a bash script swarmPath.chdir() sys mv $linearPath/amplicon_contingency_table_temp.csv $swarmPath wait # sys bash $path/scripts/createOtuContigencyTable.sh # if necessary, remove some not that nice things sys sed -i 's/:::/:/g; s/:0:0:0:0./0./g' amplicon_contingency_table.csv sys bash $path/scripts/otuContingencyTableSwarm.sh print('the contigency table with the found otus was just created!' + "\n") } #### here ends ths "if" statement of SWARM clustering ( ATTENTION TO HAVE 1 TAB HERE !! ) #////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ########################## Clustering to OTUs by making use of the CROP algorithm ############################# #////////////////////////////////////////////////////////////////////////////////////////////////////////////////// if ( paramsOfTable{'clusteringAlgoForCOI_ITS'} == 'algo_CROP' ) { string croPath = genePath + '/' + paramsOfTable{'gene'} + '/' + 'CROP' #### CROP ##### ## In this case we do chimera removal first and then we apply the clustering algorithm ---- Until now (23/3) we think we prefer CROP over SWARM outputFilePath.chdir() task sed 's/_/;size=/g' final_all_samples.fasta > COI_CROP_all_samples.fasta wait sys cp COI_CROP_all_samples.fasta $croPath wait ###### we build a contingency table for AMPLICONS - here the awk.sh file is a bash script that you can find in the PEMA file linearPath.chdir() task{ sys bash $path/scripts/createContigencyTable.sh sys sed 's/linearized.dereplicate_//g; s/.merged.fastq//g' amplicon_contingency_table_temp.csv > amplicon_contingency_table.csv sys mv amplicon_contingency_table.csv $croPath } wait print 'the contigency table has been created successfully!' + "\n" croPath.chdir() ### chimera removal task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --uchime3_denovo COI_CROP_all_samples.fasta --nonchimeras no_chim_COI_CROP_all_samples.fasta --abskew $paramsOfTable{'abskew'} wait ### calculate parameters for CROP clustering algorithm croPath.chdir() sys bash $path/scripts/croParameters.sh wait ##### at last, we create an OTU table that comes after the CROP results. Again, we have an awk command in a bash script croPath.chdir() sys mv $linearPath/amplicon_contingency_table_temp.csv $croPath wait ### perform a script that will allow us to use the CROP output as it had SWARM's format sys bash $path/scripts/cropOutputTranformations.sh # if necessary, remove some not that nice things sys sed -i 's/:::/:/g; s/:0:0:0:0./0./g' amplicon_contingency_table.csv sys bash $path/scripts/otuContingencyTableSwarm.sh # The file returned is called "SWARM_OTU_table.tsv" just to have a sole script for this task for both the cases of Swarm and Crop. It is exactly the same approach. sys mv SWARM_OTU_table.tsv CROP_OTU_table.tsv print 'the contigency table with the found otus was just created!' 
+ "\n" print 'The clustering of the MOTUs by the CROP algorithm, has been concluded!' + "\n" } } ############################################################################################################################# # # # Taxonomy Assignment # # # ############################################################################################################################## checkpoint "taxonomyAssignment.chp" parameterFilePath.chdir() string l = "parameters.tsv" string{} paramsOfTaxonomy = readParameterFile(l) ####################################################################################################### ############################## Taxonomy Assignment 16S ######################################## ####################################################################################################### ### for the taxonomy assignment in the case of the 16S marker gene, we have 2 options (alignment & phylogeny based) ### hence, PEMA performs what the user chose, according to the clustering algorithm setting as well ### taxonomy assignment using CREST and SILVA database (alignment based) for the 16S marker gene ### if ( ( paramsOfTaxonomy{'gene'} == 'gene_16S' || paramsOfTaxonomy{'gene'} == 'gene_18S' ) && paramsOfTaxonomy{'taxonomyAssignmentMethod'} == 'alignment' ) { string assignmentPath16S = '' if ( paramsOfTaxonomy{'clusteringAlgoFor16S_18SrRNA'} == "algo_vsearh" ) { string assignmentPath16S = genePath + '/' + paramsOfTaxonomy{'gene'} + '/' + 'vsearch' print("assignmentPath16S is : " + assignmentPath16S + "\n") assignmentPath16S.chdir() } else if ( paramsOfTaxonomy{'clusteringAlgoFor16S_18SrRNA'} == "algo_swarm" ) { string assignmentPath16S = genePath + '/' + paramsOfTaxonomy{'gene'} + '/' + 'swarm' assignmentPath16S.chdir() # run a script that will make all the changes needed for the SWARM output to run with LCAClassifier task bash $path/scripts/transformationsOnTable.sh wait sys mv SWARM_OTU_table.tsv rna_otutab_$paramsOfTaxonomy{'taxonomyFolderName'}.txt ## Singletons may be removed or not if ( paramsOfTaxonomy{'removeSingletons'} == 'Yes' ){ sys bash $path/scripts/removeSingletons.sh wait sys bash $path/scripts/fromLinearToMultiLinesFasta.sh wait print('No singletons file is fine' + "\n") } ### sys mv SWARM_otu_no_chimera.fasta rna_all_samples.otus.fa sys sed -i 's/;size=[0-9]*//g' rna_all_samples.otus.fa } wait if ( paramsOfTaxonomy{'silvaVersion'} == 'silva_128' ) { string algoCluster1 = paramsOfTaxonomy{'clusteringAlgoFor16S_18SrRNA'} string algo1 = algoCluster1.split('_')[1] string assignmentPath16S2 = genePath + '/' + paramsOfTaxonomy{'gene'} + '/' + algo1 assignmentPath16S2.chdir() # first we use megablast algorithm of blastn for sequence identification, intra-species comparison task $path/tools/ncbi-blast-2.8.1+/bin/blastn -task megablast -query rna_all_samples.otus.fa -db $path/tools/CREST/LCAClassifier/parts/flatdb/silvamod/silvamod128.fasta -num_alignments 100 -outfmt 5 -out rna_mysilvamod128_$paramsOfTaxonomy{'taxonomyFolderName'}.xml -num_threads $paramsOfTaxonomy{'vsearchThreads'} wait # and then we use the classifier in order to get our assignment task $path/tools/CREST/LCAClassifier/bin/classify -t rna_otutab_$paramsOfTaxonomy{'taxonomyFolderName'}.txt -o $paramsOfTaxonomy{'taxonomyFolderName'} rna_mysilvamod128_$paramsOfTaxonomy{'taxonomyFolderName'}.xml wait # we follow the same steps for the case of the Silva 132 } else if ( paramsOfTaxonomy{'silvaVersion'} == 'silva_132' ) { string algoCluster1 = paramsOfTaxonomy{'clusteringAlgoFor16S_18SrRNA'} string 
		string assignmentPath16S2 = genePath + '/' + paramsOfTaxonomy{'gene'} + '/' + algo1
		assignmentPath16S2.chdir()

		task $path/tools/ncbi-blast-2.8.1+/bin/blastn -task megablast -query rna_all_samples.otus.fa -db $path/tools/CREST/LCAClassifier/parts/flatdb/silva132/silva132 -num_alignments 100 -outfmt 5 -out rna_mysilvamod132_$paramsOfTaxonomy{'taxonomyFolderName'}.xml -num_threads $paramsOfTaxonomy{'vsearchThreads'}
		wait
		task $path/tools/CREST/LCAClassifier/bin/classify -c $path/tools/CREST/LCAClassifier/parts/etc/lcaclassifier.conf -d silva132 -t rna_otutab_$paramsOfTaxonomy{'taxonomyFolderName'}.txt -o $paramsOfTaxonomy{'taxonomyFolderName'} rna_mysilvamod132_$paramsOfTaxonomy{'taxonomyFolderName'}.xml
		wait
	}

	if ( paramsOfTaxonomy{'clusteringAlgoFor16S_18SrRNA'} == "algo_vsearch" ) {
		string taxonomyDirectory = genePath + '/' + paramsOfTaxonomy{'gene'} + '/' + 'vsearch' + '/' + paramsOfTaxonomy{'taxonomyFolderName'}
		print("taxonomyDirectory is : " + taxonomyDirectory + "\n")
		taxonomyDirectory.chdir()
		sys cp rna_otutab_* final_table.tsv
	} else if ( paramsOfTaxonomy{'clusteringAlgoFor16S_18SrRNA'} == "algo_swarm" ) {
		string taxonomyDirectory = genePath + '/' + paramsOfTaxonomy{'gene'} + '/' + 'swarm' + '/' + paramsOfTaxonomy{'taxonomyFolderName'}
		taxonomyDirectory.chdir()
		sys cp rna_otutab_* final_table.tsv
	}

####################################################################################################
##    what about another way of taxonomy assignment for the case of the 16S marker gene?          #
##    here is the phylogeny-based approach that PEMA supports, using RAxML-ng and SILVA (v.128)   #
####################################################################################################
} else if ( paramsOfTaxonomy{'gene'} == 'gene_16S' && paramsOfTaxonomy{'taxonomyAssignmentMethod'} == 'phylogeny' ) {

	## we need a REFERENCE TREE, which we generated from the SILVA database with the aid of GAPPA;
	## we used GAPPA in order to keep only 10,000 taxa from SILVA.
	## we then used RAxML-NG, which returned our REF TREE - you can find the corresponding script for
	## getting this tree in $path/tools/silva_132/for_placement
	## in order to get our PHYLOGENY-based taxonomy assignment we make use of EPA-ng // the latest version of the Evolutionary Placement Algorithm

	string assignmentPath16S = ''
	if ( paramsOfTaxonomy{'clusteringAlgoFor16S_18SrRNA'} == "algo_vsearch" ) {
		assignmentPath16S = genePath + '/' + 'gene_16S' + '/' + "vsearch"
	} else if ( paramsOfTaxonomy{'clusteringAlgoFor16S_18SrRNA'} == "algo_swarm" ) {
		assignmentPath16S = genePath + '/' + 'gene_16S' + '/' + 'swarm'
		assignmentPath16S.chdir()
		sys cp SWARM_otu_no_chimera.fasta 16S_all_samples.otus.fa
	}
	wait
	assignmentPath16S.chdir()

	# PaPaRa returns an MSA of our queries that has taken into account the info from our reference tree
	task $path/tools/PaPaRa/papara_static_x86_64 -t $path/tools/createTreeTheEasyWay/raxml_easy_right_refmsa.raxml.bestTree -s $path/tools/createTreeTheEasyWay/raxml_easy_right_refmsa.raxml.reduced.phy -q 16S_all_samples.otus.fa -r #-j 4
	wait

	# we remove the reference MSA - THIS IS A BUG OF EPA-ng !!!!
	sys tail -n +1038 papara_alignment.default > papara_alignment.phy
	wait

	# we convert our query MSA to fasta format with a script we made, because epa-ng can handle only fasta
	assignmentPath16S.chdir()
	task bash $path/scripts/convertPhylipToFasta.sh papara_alignment.phy
	wait

	# execute the epa-ng algorithm (evolutionary placement) in order to get our phylogeny-based taxonomy assignment, at last!
	task $path/tools/EPA-ng/epa/bin/epa-ng -s $path/tools/createTreeTheEasyWay/raxml_easy_right_refmsa.raxml.reduced.phy.fasta -t $path/tools/createTreeTheEasyWay/raxml_easy_right_refmsa.raxml.bestTree -q papara_alignment.fasta --model GTR{0.959447/2.52497/1.45842/1.06213/3.55143/1}+FU{0.237221/0.230211/0.294337/0.238231}+G4m{0.559838} # -w
	wait
	print("phylogeny-based taxonomy assignment worked fine!")

	string assignFolder = "16S_taxon_assign_phylogeny_assignment"
	assignmentPath16S.chdir()
	assignFolder.mkdir()
	sys mv epa* $assignFolder
}

#######################################################################################################
###################     Taxonomy assignment for the COI and the ITS marker genes     #################
#######################################################################################################

#\\\\\\\\\\\\\\\\\\\\\\      When SWARM has been used for the ASV inference      /////////////////////
if ( ( paramsOfTaxonomy{'gene'} == 'gene_COI' || paramsOfTaxonomy{'gene'} == "gene_ITS" ) && paramsOfTaxonomy{'clusteringAlgoForCOI_ITS'} == 'algo_SWARM' ) {

	string gene = ''
	if ( paramsOfTaxonomy{'gene'} == "gene_COI" ) {
		gene = "gene_COI"
	} else {
		gene = "gene_ITS"
	}
	string swarmPath = genePath + '/' + gene + '/' + 'SWARM'
	swarmPath.chdir()

	if ( paramsOfTaxonomy{'gene'} == "gene_COI" ) {

		### we remove all the new lines and make each sequence one and only one line
		task awk 'NR==1 {print ; next} {printf /^>/ ? "\n"$0"\n" : $1} END {printf "\n"}' SWARM_otu_no_chimera.fasta > nospace_swarm.fasta
		wait
		print('nospace_swarm is fine!' + "\n")

		### Remove all the singletons, in case the user wants that
		if ( paramsOfTaxonomy{'removeSingletons'} == 'Yes' ) {
			sys bash $path/scripts/removeSingletons.sh
			wait
			sys cp no_singletons.fasta rdpClassifierInput.fasta
			print('No singletons file is fine' + "\n")
		} else {
			sys cp SWARM_otu_no_chimera.fasta rdpClassifierInput.fasta
		}

		### we run the RDPClassifier - the train step has already been done and as reference database we use MIDORI
		## need to check on this... try this "-f filterbyconf: only outputs the results for major ranks as in fixrank,
		## results below the confidence cutoff were binned to a higher rank unclassified_node db: outputs the seqname, trainset_no, tax_id, conf.
		task java -Xmx64g -jar $path/tools/RDPTools/classifier.jar classify -t $path/tools/RDPTools/TRAIN/rRNAClassifier.properties -o tax_assign_swarm_COI_temp.txt rdpClassifierInput.fasta
		wait
		task {
			sys sed 's/.*root\t1.0\t//g' tax_assign_swarm_COI_temp.txt > taxonomies.txt
			sys awk '{print $1}' tax_assign_swarm_COI_temp.txt > ids.txt
			sys paste ids.txt taxonomies.txt > tax_assign_swarm_COI.txt
			sys rm ids.txt taxonomies.txt tax_assign_swarm_COI_temp.txt
		}
		print 'rdpclassifier has run as it should' + "\n"

		swarmPath.chdir()
		string noSpaceFile = 'nospace_swarm.fasta'
		noSpaceFile.delete()
		wait

		## we keep in one new file only the otus that have above 97% similarity with some species, and in another one just the unique species
		swarmPath.chdir()
		#sys bash $path/scripts/keepOnlyPerCentSimilartityForSwarm.sh
		print 'all the taxonomy assignment procedure for the SWARM case has been completed!' + "\n"
+ "\n" # Create a MOTU table as for the case of 16S # We need to change the swarm output as we have removed some amplicons after the ASV inferring because of the chimera removal task { sys bash $path/scripts/makeMotuTableForRDPClassifier.sh sys rm *_sorted almost redi out assigned true_Swarm_otu_table.tsv } ### and now the big "IF" for the case of ITS ### we will use the CREST LCAClassifier for that. } else if ( paramsOfTaxonomy{'gene'} == "gene_ITS" ) { swarmPath.chdir() # run a script that will make all the changes needed for the SWARM output to run with LCAClassifier task bash $path/scripts/transformationsOnTable.sh wait sys mv SWARM_OTU_table.tsv ITS_otutab_$paramsOfTaxonomy{'taxonomyFolderName'}.txt sys mv SWARM_otu_no_chimera.fasta ITS_all_samples.otus.fa sys sed -i 's/;size=[0-9]*//g' ITS_all_samples.otus.fa task $path/tools/ncbi-blast-2.8.1+/bin/blastn -task megablast -query ITS_all_samples.otus.fa -db $path/tools/CREST/LCAClassifier/parts/flatdb/unite/unite.fasta -num_alignments 100 -outfmt 5 -out ITS_myunitemod.xml -num_threads $paramsOfTaxonomy{'vsearchThreads'} wait task $path/tools/CREST/LCAClassifier/bin/classify -c $path/tools/CREST/LCAClassifier/parts/etc/lcaclassifier.conf -d unite -t ITS_otutab_$paramsOfTaxonomy{'taxonomyFolderName'}.txt -o $paramsOfTaxonomy{'taxonomyFolderName'} ITS_myunitemod.xml wait string taxonomyDirectoryITS = genePath + '/' + paramsOfTaxonomy{'gene'} + '/' + 'SWARM' + '/' + paramsOfTaxonomy{'taxonomyFolderName'} taxonomyDirectoryITS.chdir() cp ITS_otutab_* final_table.tsv } } wait #\\\\\\\\\\\\\\ \\\\\\\\ When CROP has been used for OTUs clustering ///////////////////////////// if ( paramsOfTaxonomy{'gene'} == 'gene_COI' && paramsOfTaxonomy{'clusteringAlgoForCOI_ITS'} == 'algo_CROP' ) { string croPath = genePath + '/' + 'gene_COI' + '/' + 'CROP' croPath.chdir() print 'The taxonomy assignment with the CROP output has STARTED! 
	task {
		sys cat CROP_output.cluster.fasta | grep ">" > heads.txt
		sys sed 's/=[0-9]*/=/g' heads.txt > heads2.txt
		sys cat CROP_output.cluster.fasta | grep -v ">" > seqs.fasta
		sys awk '{print $2}' CROP_output.cluster > abundancies.txt
		sys tail -n +2 abundancies.txt > abundancies2.txt
		sys paste heads2.txt abundancies2.txt > new_heads.txt
		sys sed 's/\t//g' new_heads.txt > final_heads.txt
		sys paste -d'\n' final_heads.txt seqs.fasta > crop_for_rdpclassifier.fasta
		sys rm heads.txt heads2.txt seqs.fasta abundancies.txt abundancies2.txt new_heads.txt final_heads.txt
	}
	print 'I have made the file I need for the taxonomy assignment' + "\n"

	## and now we run the RDPClassifier on the transformed file
	croPath.chdir()
	task java -Xmx64g -jar $path/tools/RDPTools/classifier.jar classify -t $path/tools/RDPTools/TRAIN/rRNAClassifier.properties -o taxonomy_from_rdpclassifer_COI_CROP.txt crop_for_rdpclassifier.fasta

	croPath.chdir()
	sys bash $path/scripts/keepOnlyPerCentSimilartityForCrop.sh

} else if ( paramsOfTaxonomy{'gene'} == 'gene_ITS' && paramsOfTaxonomy{'clusteringAlgoForCOI_ITS'} == 'algo_CROP' ) {

	string croPath = genePath + '/' + 'gene_ITS' + '/' + 'CROP'
	croPath.chdir()

	# run a script that makes all the changes needed for the SWARM-formatted output to run with the LCAClassifier
	sys mv CROP_OTU_table.tsv SWARM_OTU_table.tsv
	task bash $path/scripts/transformationsOnTable.sh
	wait
	sys mv SWARM_OTU_table.tsv ITS_otutab_$paramsOfTaxonomy{'taxonomyFolderName'}.txt
	sys mv CROP_output.cluster.fasta ITS_all_samples.otus.fa
	sys sed -i 's/;size=[0-9]*//g' ITS_all_samples.otus.fa

	task $path/tools/ncbi-blast-2.8.1+/bin/blastn -task megablast -query ITS_all_samples.otus.fa -db $path/tools/CREST/LCAClassifier/parts/flatdb/unite/unite.fasta -num_alignments 100 -outfmt 5 -out ITS_myunitemod.xml -num_threads $paramsOfTaxonomy{'vsearchThreads'}
	wait
	task $path/tools/CREST/LCAClassifier/bin/classify -c $path/tools/CREST/LCAClassifier/parts/etc/lcaclassifier.conf -d unite -t ITS_otutab_$paramsOfTaxonomy{'taxonomyFolderName'}.txt -o $paramsOfTaxonomy{'taxonomyFolderName'} ITS_myunitemod.xml
	wait

	string taxonomyDirectoryITS = genePath + '/' + paramsOfTaxonomy{'gene'} + '/' + 'CROP' + '/' + paramsOfTaxonomy{'taxonomyFolderName'}
	taxonomyDirectoryITS.chdir()
	sys cp ITS_otutab_* final_table.tsv
}

########################################################################################################################
#############    here starts the 'makeOtuTable' script ... take as much information as possible    ####################
########################################################################################################################

string fileFromRdpClassifier
string textFinal
string perSampleTriples

# set as variables the files that will be used and created, depending on the clustering algorithm
if ( paramsOfTaxonomy{'gene'} == 'gene_COI' && paramsOfTaxonomy{'clusteringAlgoForCOI_ITS'} == 'algo_SWARM' ) {
	string swarmPath = genePath + '/' + 'gene_COI' + '/' + 'SWARM'
	fileFromRdpClassifier = 'tax_assign_swarm_COI.txt'
	textFinal = 'finalTriplesFromSwarmClustering.txt'
	perSampleTriples = 'finalTriplesPerSampleFromSwarmClustering.txt'
	swarmPath.chdir()
} else if ( paramsOfTaxonomy{'gene'} == 'gene_COI' && paramsOfTaxonomy{'clusteringAlgoForCOI_ITS'} == 'algo_CROP' ) {
	string croPath = genePath + '/' + 'gene_COI' + '/' + 'CROP'
	fileFromRdpClassifier = 'taxonomy_from_rdpclassifer_COI_CROP.txt'
	textFinal = 'finalTriplesFromCropClustering.txt'
	perSampleTriples = 'finalTriplesPerSampleFromCropClustering.txt'
	croPath.chdir()
}
wait

# we check whether the output of the classifier is non-zero
if ( paramsOfTaxonomy{'gene'} == 'gene_COI' && fileFromRdpClassifier.size() != 0 ) {

	print ' we have an RDPClassifier output file to work with!!! '

	bool debugCode = false
	string{} giveSampleAndTaxonomyGetSizeSum
	string{} sampleIdentifiers	# set of all the samples encountered
	string{} taxaNames		# set of all the assigned taxa

	##### from the output file of both the SWARM and the CROP clustering algorithms, with the RDPClassifier run afterwards,
	##### we get an 'assignment.txt' file, from which we get all the information we need;
	##### here, we keep only the taxonomy assignments that are at a threshold and above (e.g. >97%) and we merge all the otus
	##### with the same taxonomy in the same sample into one entry;
	##### finally, we sum up the sizes of the otus we merged

	string[] lines = fileFromRdpClassifier.readLines()

	for ( string line : lines ) {

		# an input line looks like (tabs shown as _TAB_):
		# ERR1308250:22628;size=4349_TAB__TAB_root_TAB_root_TAB_1_TAB_Eukaryota_TAB_superkingdom_TAB_1_TAB_Arthropoda_TAB_phylum_TAB_...
		# the first column (index 0) is parsed for the sample name and the size; the taxon triplets
		# (name, rank, confidence) are then walked from column index 1 onwards, three columns at a time

		print line + "\n"
		string[] lineElements = line.split("\t")
		string sample = lineElements[0].split(':')[0]		# parse the 1st column (index 0), get its first component
		string sizeAsString = lineElements[0].split('=')[1]	# parse the 1st column (index 0), get its last component
		int size = sizeAsString.parseInt()
		if ( debugCode ) {
			print "TEST1\t " + sample + "\t" + size + "\n"
		}

		string thereIsMoreToRead = 'true'
		# the current triplet refers to e.g. the "Eukaryota superkingdom 1" columns with index 1, 2, 3
		int currentTripletStartColumn = 1	# start parsing now from the 2nd column, i.e. the one with index 1
		string assigned_taxo = ""
		string assignmentConfidence = '0'
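
		# A worked (hypothetical) pass over such a line: with
		#	lineElements = [ 'ERR0001:5;size=42', 'Eukaryota', 'superkingdom', '1', 'Arthropoda', 'phylum', '0.95', ... ]
		# the first iteration reads assigned_taxo = 'Eukaryota' (index 1) and its confidence at index 3;
		# it then looks ahead to index 6 (= 1 + 5), the confidence of the NEXT triplet: 0.95 < 0.97,
		# so the walk stops and 'Eukaryota' is kept as the deepest sufficiently confident taxon.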
		while ( thereIsMoreToRead == 'true' ) {

			assigned_taxo = lineElements[currentTripletStartColumn]	# read the column with index 1, 4, 7...
			if ( debugCode ) {
				print "t1:" + currentTripletStartColumn + ":" + assigned_taxo + "\n"
			}

			# the columns with index 2, 5, 8... (the ranks) are skipped;
			# read the confidence column with index 3, 6, 9...
			assignmentConfidence = lineElements[currentTripletStartColumn + 2]
			print "the assigned taxo is " + assigned_taxo + "\n"
			print "assignmentConfidence is " + assignmentConfidence + "\n"
			print "size is equal to: " + size + "\n"
			if ( debugCode ) {
				print "t2:" + currentTripletStartColumn + ":" + assignmentConfidence + "\n"
			}

			# if no next taxon triplet follows, return
			int nextTripletConfidenceColumn = currentTripletStartColumn + 5
			if ( debugCode ) {
				print "t3:" + currentTripletStartColumn + " index out of max " + lineElements.size() - 1 + "\n"
			}

			# inspect the column holding the next triplet's confidence for existence
			if ( nextTripletConfidenceColumn >= lineElements.size() ) {
				# the whole line has been read and the species level reached
				if ( debugCode ) {
					print "t4: column with index " + nextTripletConfidenceColumn + " does not exist\n"
				}
				thereIsMoreToRead = 'false'
				continue;
			}

			# if the confidence of the next taxon is below the cutoff, return
			if ( lineElements[nextTripletConfidenceColumn].parseReal() < 0.97 ) {
				if ( debugCode ) {
					print "t5:" + nextTripletConfidenceColumn + "\n"
				}
				thereIsMoreToRead = 'false'
				continue;
			}

			currentTripletStartColumn = currentTripletStartColumn + 3	# move to the next taxon triplet
		}	# end of the while loop that reads the taxa and the confidence levels

		if ( debugCode ) {
			print "TEST2\t " + sample + "\t" + assigned_taxo + "\t" + assignmentConfidence + "\t" + size + "\n"
		}

		#######################   up to here, the code finds the taxon level   ####################################
		sampleIdentifiers{ sample } = 1;
		taxaNames{ assigned_taxo } = 1;
		wait

		# sum the size to any previous size sum corresponding to this sample-and-taxonomy pair
		if ( giveSampleAndTaxonomyGetSizeSum.hasKey( sample + '_SEP_' + assigned_taxo ) ) {
			int newSum = giveSampleAndTaxonomyGetSizeSum{ sample + '_SEP_' + assigned_taxo }.parseInt()
			print("size is " + size)
			newSum += size
			print("new sum is " + newSum)
			giveSampleAndTaxonomyGetSizeSum{ sample + '_SEP_' + assigned_taxo } = newSum
		} else {
			giveSampleAndTaxonomyGetSizeSum{ sample + '_SEP_' + assigned_taxo } = size
		}
		if ( debugCode ) {
			print "TEST3\t" + sample + '_' + assigned_taxo + "-" + giveSampleAndTaxonomyGetSizeSum{ sample + '_' + assigned_taxo } + "\n"
		}
	}	# end of each input-file line
	wait

	if ( debugCode ) {
		print giveSampleAndTaxonomyGetSizeSum
	}

	# print the output, based on the giveSampleAndTaxonomyGetSizeSum map
	string sampleToPrint
	string sizeSumToPrint
	string taxonomyToPrint
	wait
	for ( string sampleAndTaxon : giveSampleAndTaxonomyGetSizeSum.keys() ) {
		sampleToPrint = sampleAndTaxon.split('_SEP_')[0]
		sizeSumToPrint = giveSampleAndTaxonomyGetSizeSum{ sampleAndTaxon }
		taxonomyToPrint = sampleAndTaxon.split('_SEP_')[1]
		sys echo '$sampleToPrint $sizeSumToPrint $taxonomyToPrint ' >> $textFinal
	}
	if ( debugCode ) {
		print taxaNames
		print sampleIdentifiers
	}

	## a double for-loop over all the possible sample-id and taxon-name combinations, printing only the real ones,
	## i.e. the ones that actually have a value
	string sizeToPrintPerSample
	for ( string sampleId : sampleIdentifiers.keys() ) {
		for ( string currentTaxon : taxaNames.keys() ) {
			## print only for the sample and taxonomy pairs that have a size
			if ( giveSampleAndTaxonomyGetSizeSum.hasKey( sampleId + "_SEP_" + currentTaxon ) ) {
				sizeToPrintPerSample = giveSampleAndTaxonomyGetSizeSum{ sampleId + "_SEP_" + currentTaxon }
				sys echo '$sampleId $sizeToPrintPerSample $currentTaxon' >> $sampleId.$perSampleTriples
			}
		}
	}
	print 'I am done with the triples!' + "\n"
    wait

    # finally, make an OTU table only with the significant assignments (>97%)
    if ( paramsOfTaxonomy{'clusteringAlgoForCOI_ITS'} == 'algo_SWARM' ) {
        string swarmPath = genePath + '/' + 'gene_COI' + '/' + 'SWARM'
        swarmPath.chdir()
        sys bash $path/scripts/createOtuTableOnlyForTaxonomyAssigned.sh
    }
    if ( paramsOfTaxonomy{'clusteringAlgoForCOI_ITS'} == 'algo_CROP' ) {
        string cropPath = genePath + '/' + 'gene_COI' + '/' + 'CROP'
        cropPath.chdir()
        sys bash $path/scripts/createOtuTableOnlyForTaxonomyAssigned.sh
    }
}

print ' At last! We now have the MOTU-table! :) ' + "\n"
wait

###################################  here ends the 'makeOtuTable' part !!!  ###############################################
#////////////////////////////////////////////////////////////////////////////////////////////////////////

##############################################################################################################################
####  now that I have everything I need, I make files per sample, in which I keep only the info I consider valuable !    ####
##############################################################################################################################

bayesPath.chdir()
string[] names = bayesPath.dir()

outputPerSamplePath.chdir()
# for each sample, make a folder with its name, in order to put PEMA's per-sample output in it
for ( string sample : names ) {
    sample.mkdir()
}

fastqcPath.chdir()
# in 'samples' we make a list with all the file names found in the FastQC output folder
string[] samples = fastqcPath.dir()

# keep only the FastQC .zip archives; the .html reports are of no use here
# (removing elements from a list while iterating over it is unsafe, so a filtered copy is built instead)
string[] fastqcZips
for ( string file : samples ) {
    if ( !file.endsWith(".html") ) {
        fastqcZips.add(file)
    }
}
samples = fastqcZips

# set variables
string unzipfq
string sampleNoZip

fastqcPath.chdir()
for ( string sample : samples ) {

    # unzip the FastQC output for each file
    task unzip $fastqcPath/$sample
    sampleNoZip = sample.split('.zip')[0]
    unzipfq = sample.split('_[0-9]_fastqc')[0]
    wait

    # copy two significant FastQC output files to the sample's output folder
    task {
        sys cp $fastqcPath/$sampleNoZip/Images/per_base_quality.png $outputPerSamplePath/$unzipfq
        sys cp $fastqcPath/$sampleNoZip/summary.txt $outputPerSamplePath/$unzipfq
    }
    wait
    unzipfq = ''
}
wait

########################################################################################################
############   making output per sample for 16S marker gene & alignment based assignment     ##########
########################################################################################################

genePath.chdir()
if ( ( paramsOfTaxonomy{'gene'} == 'gene_16S' || paramsOfTaxonomy{'gene'} == 'gene_18S' ) && paramsOfTaxonomy{'taxonomyAssignmentMethod'} == 'alignment' ) {

    string assignmentPathRNA
    if ( paramsOfTaxonomy{'clusteringAlgoFor16S_18SrRNA'} == 'algo_vsearch' ) {
        assignmentPathRNA = genePath + '/' + paramsOfTaxonomy{'gene'} + '/' + 'vsearch' + '/' + paramsOfTaxonomy{'taxonomyFolderName'}
    } else {
        assignmentPathRNA = genePath + '/' + paramsOfTaxonomy{'gene'} + '/' + 'swarm' + '/' + paramsOfTaxonomy{'taxonomyFolderName'}
    }
    assignmentPathRNA.chdir()

    task {
        sys sed "s/ /''/g" Richness.tsv > Richness2.tsv
        sys sed "s/ /''/g" All_Cumulative.tsv > All_Cumulative2.tsv
        sys sed "s/ /''/g" Relative_Abundance.tsv > Relative_Abundance2.tsv
    }
    wait

    for ( string sample : names ) {
        wait
        sys awk '{print $2, $3, $sample}' Richness2.tsv > Richness_$sample.tsv
        sys awk '{print $2, $3, $sample}' All_Cumulative2.tsv > All_Cumulative_$sample.tsv
        sys awk '{print $2, $3, $sample}' Relative_Abundance2.tsv > Relative_Abundance_$sample.tsv
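        # What the next few commands assemble, as I read them: the two *_only_int_* files
        # hold just this sample's column, so that paste can glue the taxonomy columns
        # (from Richness_$sample.tsv) and the per-sample counts into a single
        # profile_$sample.tsv, before everything is moved to the sample's own output folder.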
        sys awk '{print $sample}' All_Cumulative2.tsv > All_Cumulative_only_int_$sample.tsv
        sys awk '{print $sample}' Relative_Abundance2.tsv > Relative_Abundance_only_int_$sample.tsv
        wait
        sys paste -d ' ' Richness_$sample.tsv All_Cumulative_only_int_$sample.tsv Relative_Abundance_only_int_$sample.tsv > profile_$sample.tsv
        wait
        sys rm All_Cumulative_only_int_$sample.tsv Relative_Abundance_only_int_$sample.tsv
        wait
        sys mv profile_$sample.tsv Richness_$sample.tsv All_Cumulative_$sample.tsv Relative_Abundance_$sample.tsv $outputPerSamplePath/$sample
    }
    sys rm Richness2.tsv All_Cumulative2.tsv Relative_Abundance2.tsv
}

###########################################################################################
##################     making output per sample for COI marker gene      ##################
###########################################################################################

if ( paramsOfTaxonomy{'gene'} == 'gene_COI' && paramsOfTaxonomy{'clusteringAlgoForCOI_ITS'} == 'algo_SWARM' ) {

    string swarmPath = genePath + '/' + 'gene_COI' + '/' + 'SWARM'
    print swarmPath + "\n"
    swarmPath.chdir()

    string[] allContainsOfSwarm = swarmPath.dir()
    for ( string sample : allContainsOfSwarm ) {
        print "sample is " + sample + "\n"
        if ( sample.endsWith('.finalTriplesPerSampleFromSwarmClustering.txt') ) {
            string sampleId = sample.split('.finalTriplesPerSampleFromSwarmClustering.txt')[0]
            print "sample is " + sampleId + "\n"
            sys mv $sample $outputPerSamplePath/$sampleId
        }
    }
    # here, the number of SPECIES found across all samples is computed
    sys bash $path/scripts/uniqSpecies.sh
}

if ( paramsOfTaxonomy{'gene'} == 'gene_COI' && paramsOfTaxonomy{'clusteringAlgoForCOI_ITS'} == 'algo_CROP' ) {

    print 'This is where we have our last little gap.....' + "\n"
    string cropPath = genePath + '/' + 'gene_COI' + '/' + 'CROP'
    print cropPath + "\n"
    cropPath.chdir()

    string[] allContainsOfCrop = cropPath.dir()
    for ( string sample : allContainsOfCrop ) {
        print "sample is " + sample + "\n"
        # the per-sample files are named after the 'perSampleTriples' variable
        if ( sample.endsWith('.finalTriplesPerSampleFromCropClustering.txt') ) {
            string sampleId = sample.split('.finalTriplesPerSampleFromCropClustering.txt')[0]
            print "sample is " + sampleId + "\n"
            sys mv $sample $outputPerSamplePath/$sampleId
        }
    }
    # here, the number of SPECIES found across all samples is computed
}

checkpoint "phyloseq.chp"

parameterFilePath.chdir()
string m = "parameters.tsv"
string{} paramForPhyloseq = readParameterFile(m)

#################################################################################################
#                                                                                               #
#                                       phyloseq time!!!                                        #
#                                                                                               #
#################################################################################################

# In order to get a phyloseq analysis, a phylogenetic tree of the OTUs found needs to be built.
# For that purpose, PEMA creates a Multiple Sequence Alignment (MSA) using the MAFFT algorithm;
# then, using RAxML, the tree is built.
# Finally, the user has to provide PEMA with a "metadata.csv" file. This needs to be in the analysis folder, along with parameters.tsv
# Pay attention!
# You also need to set the silva version variable in the "phyloseq_in_PEMA.R" file, in order to get the phyloseq analysis.
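# A note on resuming: the checkpoint above makes this part of the pipeline restartable.
# A stopped run can be recovered from "phyloseq.chp" (with something like `bds -r phyloseq.chp`,
# assuming the usual bds checkpoint-recovery flag), and since parameters.tsv is re-read right
# after the checkpoint, a resumed run picks up any edits made to it in the meantime.
# The clustering folder used below is derived from the parameter value, e.g.:
#    string algo = 'algo_SWARM'.split('_')[1]    # -> 'SWARM'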
if ( paramForPhyloseq{'taxonomyAssignmentMethod'} == 'alignment' && paramForPhyloseq{'phyloseq'} == 'Yes' ) {

    string phyloseq_folder = 'phyloseq_output'
    outputFilePath.chdir()

    # check if a phyloseq-output folder has already been made in a previous run
    if ( phyloseq_folder.isDir() ) {
        print 'a phyloseq-output folder has already been built\n'
        wait
    } else {
        # if not, create a folder where the input/output of phyloseq will be located
        phyloseq_folder.mkdir()
    }

    # set the assignmentPath variable according to the marker gene chosen
    string assignmentPath
    if ( paramForPhyloseq{'gene'} == 'gene_ITS' ) {
        string algoCluster = paramForPhyloseq{'clusteringAlgoForCOI_ITS'}
        string algo = algoCluster.split('_')[1]
        assignmentPath = genePath + '/' + paramForPhyloseq{'gene'} + '/' + algo + '/' + paramForPhyloseq{'taxonomyFolderName'}
    } else if ( paramForPhyloseq{'gene'} == 'gene_COI' ) {
        string algoCluster = paramForPhyloseq{'clusteringAlgoForCOI_ITS'}
        string algo = algoCluster.split('_')[1]
        assignmentPath = genePath + '/' + paramForPhyloseq{'gene'} + '/' + algo
        assignmentPath.chdir()
        sys bash $path/scripts/makePhyloseqForCoiCase.sh
    } else {
        string algoCluster = paramForPhyloseq{'clusteringAlgoFor16S_18SrRNA'}
        string algo = algoCluster.split('_')[1]
        assignmentPath = genePath + '/' + paramForPhyloseq{'gene'} + '/' + algo + '/' + paramForPhyloseq{'taxonomyFolderName'}
        assignmentPath.chdir()
    }

    ## if you want to use phyloseq functions that need a phylogenetic tree of the OTUs found, such a tree has to be built!
    if ( paramForPhyloseq{'tree'} == 'Yes' ) {

        print("assignmentPath is: " + assignmentPath + '\n')
        assignmentPath.chdir()

        ## at first, we align the sequences of the final OTUs using the MAFFT aligner
        task /usr/bin/mafft --auto --reorder --thread -1 Aligned_assignments.fasta > aligned_otus.fasta
        wait
        print 'mafft alignment has been concluded' + '\n'

        # keep only the sequence id on each FASTA header line and replace ':' with '_' in the ids
        sys awk '{ if (substr($1,1,1) ~ /^>/ ) {print $1} else {print $0} }' aligned_otus.fasta > aligned_otus.fasta.temp
        sys rm aligned_otus.fasta
        sys mv aligned_otus.fasta.temp aligned_otus.fasta
        sys sed -i 's/:/_/g' aligned_otus.fasta

        ## to get a nice tree, we first run the RAxML-ng "check" algorithm, in order to check our MSA
        task $path/tools/raxml-ng/raxml-ng/bin/raxml-ng --check --msa aligned_otus.fasta --model GTR+G4 || true
        wait
        print 'The check of the alignment is done!' + '\n'
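        ## NOTE: as far as I know, when --check spots issues with the MSA (e.g. duplicate
        ## sequences or fully undetermined columns), raxml-ng writes a cleaned copy of it
        ## as aligned_otus.fasta.raxml.reduced.phy; that is why the branch below keys off
        ## the existence of that file to choose which MSA goes into the tree search.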
        ## and then the tree is created using RAxML
        string test = 'aligned_otus.fasta.raxml.reduced.phy'
        if ( test.exists() ) {
            task $path/tools/raxml-ng/raxml-ng/bin/raxml-ng --all --msa-format PHYLIP --msa aligned_otus.fasta.raxml.reduced.phy --tree pars{$paramForPhyloseq{'parsTrees'}} --bs-trees $paramForPhyloseq{'bootstrapTrees'} --model GTR+G4 --threads $paramForPhyloseq{'raxmlThreads'} --force
            wait
            sys mv aligned_otus.fasta.raxml.reduced.phy.raxml.bestTree aligned_otus.fasta.raxml.bestTree
        } else {
            print 'The mafft alignment was good for raxml-ng' + '\n'
            task $path/tools/raxml-ng/raxml-ng/bin/raxml-ng --all --msa-format FASTA --msa aligned_otus.fasta --tree pars{$paramForPhyloseq{'parsTrees'}} --bs-trees $paramForPhyloseq{'bootstrapTrees'} --model GTR+G4 --threads $paramForPhyloseq{'raxmlThreads'} --force
            wait
        }
        wait
    }

    ## finally, copy all the necessary PEMA output files that phyloseq uses as input to the phyloseq-output folder
    assignmentPath.chdir()
    task {
        ## add the OTU-table to the phyloseq folder and name it the way the phyloseq script wants
        sys cp final_table.tsv $outputFilePath/phyloseq_output
        sys cd $outputFilePath/phyloseq_output
        sys mv final_table.tsv otu_table.txt
        sys sed -i 's/:/_/g' otu_table.txt
        ## add the metadata file to the phyloseq folder
        sys cp $path/metadata.csv $outputFilePath/phyloseq_output
    }

    string test2 = "aligned_otus.fasta.raxml.bestTree"
    if ( test2.exists() ) {
        task {
            sys cp aligned_otus.fasta.raxml.bestTree $outputFilePath/phyloseq_output
            sys cd $outputFilePath/phyloseq_output
            sys mv aligned_otus.fasta.raxml.bestTree OTUs_tree.tre
        }
    }

    # run the phyloseq script
    string phyloseqPath = outputFilePath + '/' + "phyloseq_output"
    phyloseqPath.chdir()
    sys cp $path/phyloseq_in_PEMA.R .
    wait
    sys /usr/bin/R-3.6.0/bin/Rscript phyloseq_in_PEMA.R $paramForPhyloseq{'gene'} $paramForPhyloseq{'silvaVersion'}
    wait
}

###################################################################################################################
#                                                                                                                 #
#                                          phyloseq has been concluded                                            #
#                                                                                                                 #
###################################################################################################################

### make a folder where all the checkpoints will be moved
parameterFilePath.chdir()
if ( paramForPhyloseq{'emptyCheckpoints'} == 'Yes' ) {
    string checkpointsDir = 'checkpoints_for_' + paramForPhyloseq{'outputFolderName'}
    outputFilePath.chdir()
    checkpointsDir.mkdir()
    #parameterFilePath.chdir()
    path.chdir()
    sys mv *.chp $outputFilePath/$checkpointsDir
}

# # keep only one OTU-table per analysis, in the case of the 16S marker gene and the alignment based taxonomy assignment method
# if ( paramForPhyloseq{'gene'} == '16S' && paramForPhyloseq{'taxonomyAssignmentMethod'} == 'alignment' ) {
#
#    string algoCluster = paramForPhyloseq{'clusteringAlgoFor16S_18SrRNA'}
#    string algo = algoCluster.split('_')[1]
#    string assignmentPath = genePath + '/' + "gene_16S" + '/' + algo + '/' + paramForPhyloseq{'taxonomyFolderName'}
#    assignmentPath.chdir()
#    sys cd
#    sys mv 16S_otu_table.txt 16S_otutab_$paramForPhyloseq{'taxonomyFolderName'}.txt
# }

outputFilePath.chdir()
print(" \n PEMA has come to an end! Let biology start! \n ")

################################################################################################
#                                                                                              #
#                                   PEMA has been concluded.                                   #
#                                                                                              #
################################################################################################