#!/usr/bin/env bds

# This is PEMA! A metabarcoding pipeline for environmental DNA samples.
# Author: Haris Zafeiropoulos
# date: 2017-2018
# PEMA needs to be executed inside a container - Docker or Singularity.

#############################################################################################
#########       FUNCTION TO READ THE parameters.tsv FILE EACH TIME             #############
#############################################################################################

# first of all, read the parameters file and keep all parameters in a map (a map is the BDS dictionary type);
# to be able to restart runs from checkpoints, we use a function so that the parameters file can be re-read at any step
string{} readParameterFile(string parameter_file) {

    string parameters_file = parameter_file
    lines := parameters_file.readLines()
    string{} my_case

    for ( string line : lines ) {
        # we need a map with the parameters as keys and their values as values, hence we skip lines starting with "#"
        if ( line.startsWith("#") ) {
            continue
        } else {
            string pname, pvalue
            (pname, pvalue) = line.split('\t')    # tab - split
            my_case{ pname } = pvalue             # keep the 1st element as key (parameter) and the 2nd as its value (parameter's value)
            pname = ''
            pvalue = ''
        }
    }

    # return the map
    return( my_case )
}

###############################################################################################
################       set PEMA's environment so that it can be executed      ################
###############################################################################################

# read the parameters file for the first time!
string parameterFilePath = "/mnt/analysis/"
parameterFilePath.chdir()

# set a variable with the parameters file as its value
string f = "parameters.tsv"
string{} paramsFirstAssignment = readParameterFile(f)

# keep as a variable the path where PEMA is found
string path = "/home"

# keep as a variable the path where the data are found
string dataPath = "/mnt/analysis/mydata"

# keep as a variable the path of a BLAST database
string blastDatabasePath = "/mnt/databases"

# PEMA will also leave its output in /mnt
string outputPoint = "/mnt/analysis"
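# A minimal sketch of what parameters.tsv is assumed to look like: one tab-separated key/value
# pair per line, with '#' lines ignored by readParameterFile(). The values shown here are
# purely illustrative, not defaults:
#
#   # PEMA parameters
#   outputFolderName	my_first_analysis
#   gene	gene_16S
#   clusteringAlgo	algo_CROP
#   maxInfo	Yes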
# Quite an IMPORTANT step!
# If there are TrimLog files in the raw-data folder, PEMA does not run as it should; hence, we remove them if there are any!
# This happens when we re-run PEMA over the same dataset.
dataPath.chdir() ;

# check if there is a TrimLog file and IF YES, then delete it
sys if [ $(find "$dataPath" -name "TrimLog*") ] ; then rm TrimLog* ; fi

# create a folder where the pipeline's output will be saved
outputPoint.chdir()

# check whether an output folder with the name given in the parameters file has already been created by previous runs
if ( paramsFirstAssignment{'outputFolderName'}.isDir() ) {
    print "this output folder already exists\n"
} else {
    # if not, create a folder with that name
    paramsFirstAssignment{'outputFolderName'}.mkdir()
    print "a new output folder was just created!\n"
}

# keep the output folder name as a variable
string analysisName = paramsFirstAssignment{'outputFolderName'}

# here are, as variables, all the directories that will be used
string qualityControl = "1.quality_control" ;
string trimmomatic = "2.trimmomatic_output" ;
string bayesHammer = "3.correct_by_BayesHammer"
string spades = "4.merged_by_SPAdes" ;
string dereplicate = "5.dereplicate_by_obiuniq" ;
string linearized = "6.linearized_files" ;
string geneDependent = "7.gene_dependent"
string outputPerSample = "8.output_per_sample"

## and the paths for each of them
string outputFilePath = outputPoint + "/" + paramsFirstAssignment{'outputFolderName'}
string fastqcPath = outputFilePath + "/" + qualityControl ;
string trimoPath = outputFilePath + "/" + trimmomatic ;
string bayesPath = outputFilePath + "/" + bayesHammer ;
string spaPath = outputFilePath + "/" + spades ;
string derePath = outputFilePath + "/" + dereplicate ;
string linearPath = outputFilePath + "/" + linearized ;
string genePath = outputFilePath + "/" + geneDependent ;
string outputPerSamplePath = outputFilePath + "/" + outputPerSample

# and create an output folder for each pipeline step - if PEMA runs for the first time for a specific experiment
outputFilePath.chdir()

# make a list with all the directory names found in the output directory
string[] outputFiles = outputFilePath.dir()

# if that list is empty, then create the directories mentioned above
if ( outputFiles.isEmpty() == true ) {
    # create all the output folders
    qualityControl.mkdir() ;
    trimmomatic.mkdir() ;
    bayesHammer.mkdir() ;
    spades.mkdir() ;
    dereplicate.mkdir() ;
    linearized.mkdir() ;
    geneDependent.mkdir()
    outputPerSample.mkdir()
}

# gene-dependent directory
genePath.chdir()
paramsFirstAssignment{'gene'}.mkdir()

wait
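# For orientation, the output layout created above should look roughly like this
# (a sketch only; 4.merged_by_SPAdes is renamed to 4.merged_by_PANDAseq at the very end):
#
#   /mnt/analysis/<outputFolderName>/
#     1.quality_control
#     2.trimmomatic_output
#     3.correct_by_BayesHammer
#     4.merged_by_SPAdes
#     5.dereplicate_by_obiuniq
#     6.linearized_files
#     7.gene_dependent/<gene>
#     8.output_per_sample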
#///////////  from this point, PEMA and its tools start running  ///////////

##########################################################################################
##########################          1. FastQC          ##################################
##########################################################################################

# if there are TrimLog files in the raw-data folder, PEMA does not run as it should; hence, we remove them if there are any!
# we re-run this command because of FastQC and the way it behaves over multiple runs on the same folders
dataPath.chdir() ;
sys if [ $(find "$dataPath" -name "TrimLog*") ] ; then rm TrimLog* ; fi

# make a copy of the "parameters.tsv" file in the output folder;
# this way the user can retrieve the parameters used for an analysis at any time!
sys cp $parameterFilePath/parameters.tsv $outputFilePath/parameters0f.$analysisName.tsv

# now FastQC runs over all files
string[] rawFiles = dataPath.dir()
for ( string rawFile : rawFiles ) {
    print(rawFile)
    task $path/tools/fastqc/FastQC/fastqc --outdir $fastqcPath $dataPath/$rawFile
}
wait
print("FastQC is completed!" + "\n")

# now we create a checkpoint and read the parameters file again;
# this way, if any parameter is changed, only the steps from here on are executed with the new values
if ( qualityControl.isEmpty() == false ) {
    checkpoint "trimming.chp"
}
parameterFilePath.chdir()
string g = "parameters.tsv"
string{} paramsTrimmoStep = readParameterFile(g)

###########################################################################################
###########################         2. Trimmomatic         ###############################
###########################################################################################

dataPath.chdir() ;

# go to the folder that contains all the raw data and make a list (named 'data') with all the file names found there
string[] data = dataPath.dir()

# set two variables, readF and readR, for the forward and reverse file of each sample
string readF
string readR
int counter = 0

for ( string file : data ) {

    # here is the trick! we need to tell which file is the forward and which the reverse one, hence we try to split off a suffix.
    string check = file.split("_2\.fastq\.gz")[0]

    # if the split does occur, we know we have the second (reverse) file of the sample;
    # that is because the 'file' variable always has a '.fastq.gz' suffix,
    # so if there is no split, the 'file' and 'check' variables are the same
    if ( file == check ) {
        readF = file
        print("readF is: " + readF + "\n")
    # if there is a split, then they differ
    } else {
        readR = file
        print("readR is: " + readR + "\n")
    }

    wait

    if ( readF.isEmpty() == false && readR.isEmpty() == false ) {

        if ( paramsTrimmoStep{'maxInfo'} == 'Yes' ) {
            print("You have selected Max INFO" + "\n")
            wait
            task java -jar $path/tools/Trimmomatic/Trimmomatic-0.38/trimmomatic-0.38.jar PE -threads $paramsTrimmoStep{'threadsTrimmomatic'} -trimlog TrimLog $dataPath/$readF $dataPath/$readR $trimoPath/filtered_max_$readF.1P.fastq.gz $trimoPath/filtered_max_$readF.1U.fastq.gz $trimoPath/filtered_max_$readR.2P.fastq.gz $trimoPath/filtered_max_$readR.2U.fastq.gz ILLUMINACLIP:/$path/tools/Trimmomatic-0.38/adapters/$paramsTrimmoStep{'adapters'}:$paramsTrimmoStep{'seedMismatches'}:$paramsTrimmoStep{'palindromeClipThreshold'}:$paramsTrimmoStep{'simpleClipThreshold'} LEADING:$paramsTrimmoStep{'leading'} TRAILING:$paramsTrimmoStep{'trailing'} MAXINFO:$paramsTrimmoStep{'targetLength'}:$paramsTrimmoStep{'strictness'} MINLEN:$paramsTrimmoStep{'minlen'}
            wait
            readF = ''
            readR = ''

        } else if ( paramsTrimmoStep{'maxInfo'} == 'No' ) {
            print("You have selected no Max INFO" + "\n")
            task java -jar $path/tools/Trimmomatic/Trimmomatic-0.38/trimmomatic-0.38.jar PE -threads $paramsTrimmoStep{'threadsTrimmomatic'} -trimlog TrimLog2 $dataPath/$readF $dataPath/$readR $trimoPath/filtered_$readF.1P.fastq.gz $trimoPath/filtered_$readF.1U.fastq.gz $trimoPath/filtered_$readR.2P.fastq.gz $trimoPath/filtered_$readR.2U.fastq.gz ILLUMINACLIP:/$path/tools/Trimmomatic-0.38/adapters/$paramsTrimmoStep{'adapters'}:$paramsTrimmoStep{'seedMismatches'}:$paramsTrimmoStep{'palindromeClipThreshold'}:$paramsTrimmoStep{'simpleClipThreshold'} LEADING:$paramsTrimmoStep{'leading'} TRAILING:$paramsTrimmoStep{'trailing'} MINLEN:$paramsTrimmoStep{'minlen'}
            wait
            readF = ''
            readR = ''
        }
    }
}
wait

print "Trimmomatic is done\n\n" ;
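# Illustrative recap of the forward/reverse pairing trick used above (sample name is hypothetical):
#   ERR1308250_1.fastq.gz  ->  no "_2.fastq.gz" suffix to split off  ->  readF
#   ERR1308250_2.fastq.gz  ->  the split occurs                      ->  readR
# once both readF and readR are set, Trimmomatic runs on that pair and both variables are reset.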
# let us make a CHECKPOINT from which we will be able to restart our analysis at the error-correction step
path.chdir() ;
trimoPath.chdir() ;
if ( trimoPath.isEmpty() == false ) {
    checkpoint "errorCorrection.chp"
}
parameterFilePath.chdir()
string h = "parameters.tsv"
string{} paramsBayesHammerStep = readParameterFile(h)

################################################################################################
##################     3. BayesHammer - error correction // SPAdes     ########################
################################################################################################

# make a list with all the file names found in the folder with Trimmomatic's
# output -- e.g. suffix: '2.fastq.gz.2U.fastq.gz'
trimoPath.chdir()
string[] trimos = trimoPath.dir()

for ( string file : trimos ) {
    # we delete all the 'U.fastq.gz' (unpaired) files using the list we just made
    if ( file.endsWith("U\.fastq\.gz") ) {
        file.delete()
    }
}

# and then we make a new list with the files that remained
string[] finalTrimos = trimoPath.dir()
string readBayesF
string readBayesR
string newDir
string newFile
wait

for ( string trimmed : finalTrimos ) {

    # as in the Trimmomatic step, we need to know which is the forward and which the reverse file
    string trimCheck = trimmed.split("_2\.fastq\.gz\.2P\.fastq\.gz")[0]
    if ( trimmed == trimCheck ) {
        readBayesF = trimmed
    } else {
        readBayesR = trimmed
    }

    if ( readBayesF.isEmpty() == false && readBayesR.isEmpty() == false ) {

        if ( paramsBayesHammerStep{'maxInfo'} == 'Yes' ) {
            string newName = trimCheck.split("filtered\_max\_")
            newDir = newName.substr(3,13) ;
            print("trimCheck: " + trimCheck + "\n" + "newDir: " + newDir + "\n")
        } else if ( paramsBayesHammerStep{'maxInfo'} == 'No' ) {
            string newName = trimCheck.split("filtered_")
            newDir = newName.substr(3,13) ;
            print("trimCheck: " + trimCheck + "\n" + "newDir: " + newDir + "\n")
        }

        # work where the output of BayesHammer is located!
        bayesPath.chdir() ;
        newDir.mkdir() ;

        # go back to Trimmomatic's output
        trimoPath.chdir()
        wait

        # and execute the BayesHammer algorithm that ships with SPAdes
        sys which gzip
        sys gzip --version
        print("readBayesF: " + readBayesF + "\n" + "and readBayesR: " + readBayesR + "\n")
        task $path/tools/SPAdes/SPAdes-3.13.0-Linux/bin/spades.py --only-error-correction -o $bayesPath/$newDir -1 $readBayesF -2 $readBayesR --disable-gzip-output
        wait

        # clear all variables used in the BayesHammer step so they can be re-used
        readBayesF = ''
        readBayesR = ''
        newFile = ''
        newDir = ''
    }
}
wait
print("Error correction using BayesHammer is completed!" + "\n")

# let us make a checkpoint so the pipeline can restart at the merging step!
if ( bayesPath.isEmpty() == false ) {
    checkpoint "mergingWithSpades.chp"
}
parameterFilePath.chdir()
string i = "parameters.tsv"
string{} paramsSpadesMerging = readParameterFile(i)

#############################################################################################
############################   4. PANDAseq - merging/overlapping   #########################
#############################################################################################

bayesPath.chdir()

# keep in a list all files that came from the BayesHammer algorithm, leaving aside some stats files;
# i.e. make a list with all the folders in the BayesHammer output folder - only sample names (e.g. SRR1643855)
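# Sketch of the per-sample layout expected by the merging step below (illustrative, for a
# sample named SRR1643855 with maxInfo = Yes):
#
#   3.correct_by_BayesHammer/SRR1643855/corrected/filtered_max_SRR1643855_1.fastq.gz.1P.fastq.00.0_0.cor.fastq
#   3.correct_by_BayesHammer/SRR1643855/corrected/filtered_max_SRR1643855_2.fastq.gz.2P.fastq.00.0_0.cor.fastq
#
# (the files are gzipped in the loop below, which yields the '.cor.fastq.gz' names assembled there)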
string[] correct = bayesPath.dir()
correct.remove("input_dataset.yaml") ;
correct.remove("params.txt") ;
correct.remove("spades.log") ;
correct.remove("tmp")

# we need to "create" the names of the forward and reverse files each time;
# we keep as 'baseName' the prefix of every file, which depends on whether maxInfo was used or not
string baseName
if ( paramsSpadesMerging{'maxInfo'} == 'Yes' ) {
    baseName = "filtered_max_"
} else {
    baseName = "filtered_"
}

for ( string correctFile : correct ) {

    string uniqPath = bayesPath + '/' + correctFile + '/' + 'corrected' ;

    # each time, we enter the directory of a specific sample inside the output folder of BayesHammer - the previous step
    uniqPath.chdir() ;
    sys /bin/gzip *.fastq

    # in the 'corrected' folder of each sample there are two files - one forward, one reverse -
    # and we build two variables with their names
    string forward = "_1.fastq.gz.1P.fastq.00.0_0.cor.fastq.gz"
    string reverse = "_2.fastq.gz.2P.fastq.00.0_0.cor.fastq.gz"
    string forwardFile = baseName + correctFile + forward ;
    print(forwardFile + "\n")
    string reverseFile = baseName + correctFile + reverse ;
    print(reverseFile + "\n")
    wait

    # if the sample names are not Illumina-style and PANDAseq has to be used,
    # the sample names in the raw data have to be changed to e.g. "ERR0001"
    if ( paramsSpadesMerging{'EnaData'} == 'No' ) {
        uniqPath.chdir()
        sys bash $path/scripts/prepareForPandaseq.sh $forwardFile $reverseFile
    }

    # finally, we execute the PANDAseq algorithm for merging
    task $path/tools/PANDAseq/bin/pandaseq -f $forwardFile -r $reverseFile -6 -A $paramsSpadesMerging{'pandaseqAlgorithm'} -a -B -T $paramsSpadesMerging{'pandaseqThreads'} > $correctFile.merged.fastq.gz
    wait

    if ( paramsSpadesMerging{'EnaData'} == 'No' ) {
        string mergedFile = correctFile + ".merged.fastq.gz"
        sys $path/scripts/recoverSampleNames.sh $mergedFile
    }

    sys mv *.merged.fastq.gz $spaPath
    bayesPath.chdir()
}
wait

# now that merging has run for all samples, we move all merged files into a new folder named "unzip"
# and strip their '.gz' suffix; we go to the folder with the merged output and make a list with its elements
spaPath.chdir() ;
string[] merged = spaPath.dir() ;
string unzip = "unzip" ;

# we make a new directory named 'unzip'
unzip.mkdir() ;
string unzipPath = "$spaPath/unzip" ;

# and we remove the element 'unzip' from the list we just made
merged.remove("unzip")

for ( string anyFile : merged ) {
    sys cp $anyFile $unzipPath
    unzipPath.chdir() ;
    string newFile = anyFile.baseName(".gz")
    sys mv $anyFile $newFile
    spaPath.chdir()
}
wait
print("Merging step by PANDAseq is completed" + "\n")

# we make a checkpoint so the pipeline can restart at the dereplication step!
if ( spaPath.isEmpty() == false ) {
    checkpoint "dereplicationStep.chp"
}
parameterFilePath.chdir()
string j = "parameters.tsv"
string{} paramsDereplication = readParameterFile(j)

##########################################################################################################
#########################     5. Dereplication with the help of OBITools!     ###########################
##########################################################################################################

unzipPath.chdir() ;
string[] unzips = unzipPath.dir()
string fileForDerepl

for ( string nodereplicate : unzips ) {
    # now we execute OBITools' obiuniq
    task $path/tools/OBITools-1.2.12/bin/obiuniq -m sample $nodereplicate > $derePath/dereplicate_$nodereplicate ;
    wait
}
wait
print("all the first steps are done! clustering is about to start!" + "\n")
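# Hedged note: the checkpoints written above ("trimming.chp", "errorCorrection.chp",
# "mergingWithSpades.chp", "dereplicationStep.chp") are meant to let a run be resumed without
# redoing earlier steps. Assuming the standard BigDataScript restore flag, resuming would look like:
#
#   bds -r dereplicationStep.chp
#
# after (optionally) editing parameters.tsv - the parameters file is re-read right after each checkpoint.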
+"\n") checkpoint "tableStep.chp" parameterFilePath.chdir() string k = "parameters.tsv" string{} paramsOfTable = readParameterFile(k) ############################################################################################################## ######################################### MERGING ALL FILES IN (1) ONE ############################ ############################################################################################################# derePath.chdir() # we make a list with all the file names that exist in the folder with the dereplication-step output string[] dereplicates = derePath.dir() for ( string derepl : dereplicates ) { # and we make some transforms in all these files task { # we substitue both 'merged_sample...' amd ';' with '_' and nothing respectively sys sed 's/ merged_sample={}; count=/_/g ; s/;//g' $derepl > se.$derepl # merge all lines of a fastq entry into one and only one line! sys awk 'NR==1 {print ; next} {printf /^>/ ? "\n"$0"\n" : $1} END {printf "\n"}' se.$derepl > nospace.$derepl # we turn all down- to upcases sys cat nospace.$derepl | tr '[a-z]' '[A-Z]' > $linearPath/linearized.$derepl } } wait derePath.chdir() sys rm se.* nospace.* # we go to the folder with the linearized data and make a list with all its contents linearPath.chdir() string[] linears = linearPath.dir() # we remove from these files some characteristic symbols for ( string sample : linears ) { task{ sys sed 's/\:\:\:/\:/g' $sample > $sample.f.fasta sys sed 's/\:0\:0\:0\://' $sample.f.fasta > $sample.g.fasta } } wait linearPath.chdir() sys rm *.f.fasta wait # this command forces applications to use the default language for output, and forces sorting to be bytewise. sys export LC_ALL=C # here I read all the files I have made at once and run over them an awk command in which I set as record separator ">" and as field separaton '_' or new line. # from the second line I make an array with the sample as keys and their abundances as value and anotherone with the otus as keys and the sequnece asw value. # finally, I make a file in which in every line i have the amplicon name (sample) its abundance and its sequence. 
task cat *.g.fasta | awk 'BEGIN {RS = ">" ; FS = "[_\n]"} {if (NR != 1) {abundances[$1] += $2 ; sequences[$1] = $3}} END {for (amplicon in sequences) {print ">" amplicon "_" abundances[amplicon] "_" sequences[amplicon]}}' | sort --temporary-directory=$(pwd) -t "_" -k2,2nr -k1.2,1d | sed -e 's/\_/\n/2' > $outputFilePath/all_samples.fasta
wait

# drop, from all_samples.fasta, the two-line window starting at any line that contains an 'N'
task awk '/N/{n=2}; n {n--; next}; 1' < $outputFilePath/all_samples.fasta > $outputFilePath/final_all_samples.fasta
wait
sys rm *.g.fasta

######################################################################################################################################
## the file 'final_all_samples.fasta' now holds all the reads from all our samples, so we can move on to the CLUSTERING step;
## however, even though the procedure so far was the same regardless of the marker gene,
## we now have to choose between different paths depending on the gene and our needs
######################################################################################################################################

# depending on the marker gene and (for the COI case) the clustering algorithm, we create the right folders
genePath.chdir()
if ( paramsOfTable{'gene'} == 'gene_COI' ) {
    string coiPath = genePath + '/' + 'gene_COI'
    coiPath.chdir()
    if ( paramsOfTable{'clusteringAlgo'} == 'algo_CROP' ) {
        string algo = 'CROP'
        algo.mkdir()
    } else if ( paramsOfTable{'clusteringAlgo'} == 'algo_SWARM' ) {
        string algo = 'SWARM'
        algo.mkdir()
    }
}

###################################################################################################################
##################################################   16S CASE   ##################################################
###################################################################################################################

if ( paramsOfTable{'gene'} == 'gene_16S' ) {

    print " my marker gene is 16S\n "

    ## chimera removal & OTU clustering with VSEARCH - this workflow performs BOTH chimera removal and clustering
    string sixteenPath = genePath + '/' + "gene_16S"
    outputFilePath.chdir()
    task sed 's/_/;size=/g' final_all_samples.fasta > $sixteenPath/16S_all_samples.fasta
    wait

    ######################   here we execute the VSEARCH algorithms   #############################
    sixteenPath.chdir()

    # full-length dereplication
    task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --threads $paramsOfTable{'vsearchThreads'} --derep_fulllength 16S_all_samples.fasta --minuniquesize 2 --sizein --sizeout --fasta_width 0 --uc all.derep.uc --output all.derep.fasta
    wait

    # execute the first clustering step
    task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --threads $paramsOfTable{'vsearchThreads'} --cluster_size all.derep.fasta --id $paramsOfTable{'vsearchId'} --strand both --sizein --sizeout --fasta_width 0 --uc all.preclustered.uc --centroids all.preclustered.fasta
    wait

    # chimera removal
    task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --threads $paramsOfTable{'vsearchThreads'} --uchime_denovo all.preclustered.fasta --sizein --sizeout --fasta_width 0 --nonchimeras all.denovo.nonchimeras.fasta
    wait

    # rename an output file
    sys mv all.denovo.nonchimeras.fasta all.ref.nonchimeras.fasta

    # here we use two Perl scripts located in the "scripts" folder
    task $path/scripts/map_vsearch.pl all.derep.fasta all.preclustered.uc all.ref.nonchimeras.fasta > all.nonchimeras.derep.fasta
    wait
    task $path/scripts/map_vsearch.pl 16S_all_samples.fasta all.derep.uc all.nonchimeras.derep.fasta > all.nonchimeras.fasta
    wait
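    # Hedged note: the two map_vsearch.pl calls above mirror the classic VSEARCH chimera-checking
    # recipe - they map the non-chimeric centroids back, first onto the dereplicated set and then
    # onto the full per-sample amplicon set, so that the final clustering below runs on all reads
    # whose centroid survived chimera removal.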
    # execute the second clustering step
    task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --threads $paramsOfTable{'vsearchThreads'} --cluster_size all.nonchimeras.fasta --id 0.97 --strand both --sizein --fasta_width 0 --uc all.clustered.uc --relabel Otu --centroids all.otus.fasta --otutabout all.otutab.txt
    wait
    sys mv all.otus.fasta 16S_all_samples.otus.fa
    sys mv all.otutab.txt 16S_otutab.txt
    sys cp 16S_otutab.txt 16S_otutab_$paramsOfTable{'taxonomyFolderName'}.txt

    #######  finally, we execute the TAXONOMY ASSIGNMENT step using CREST and the SILVA database (alignment-based)  ###########
    if ( paramsOfTable{'taxonomyAssignmentMethod'} == 'alignment' ) {

        sixteenPath.chdir()
        if ( paramsOfTable{'silvaVersion'} == 'silva_128' ) {
            # first we use the megablast algorithm of blastn for sequence identification, intra-species comparison
            task $path/tools/ncbi-blast-2.8.1+/bin/blastn -task megablast -query 16S_all_samples.otus.fa -db $path/tools/CREST/LCAClassifier/parts/flatdb/silvamod/silvamod128.fasta -num_alignments 100 -outfmt 5 -out 16S_mysilvamod128_$paramsOfTable{'taxonomyFolderName'}.xml -num_threads 10
            wait
            # and then we use the classifier to get our assignment
            task $path/tools/CREST/LCAClassifier/bin/classify -t 16S_otutab_$paramsOfTable{'taxonomyFolderName'}.txt -o $paramsOfTable{'taxonomyFolderName'} 16S_mysilvamod128_$paramsOfTable{'taxonomyFolderName'}.xml

        # we follow the same steps in the case of SILVA 132
        } else if ( paramsOfTable{'silvaVersion'} == 'silva_132' ) {
            task $path/tools/ncbi-blast-2.8.1+/bin/blastn -task megablast -query 16S_all_samples.otus.fa -db $path/tools/CREST/LCAClassifier/parts/flatdb/silva132/silva132 -num_alignments 100 -outfmt 5 -out 16S_mysilvamod132_$paramsOfTable{'taxonomyFolderName'}.xml -num_threads 10
            wait
            task $path/tools/CREST/LCAClassifier/bin/classify -c $path/tools/CREST/LCAClassifier/parts/etc/lcaclassifier.conf -d silva132 -t 16S_otutab_$paramsOfTable{'taxonomyFolderName'}.txt -o $paramsOfTable{'taxonomyFolderName'} 16S_mysilvamod132_$paramsOfTable{'taxonomyFolderName'}.xml
            wait
        }

    ## what about another way to assign taxonomy in the case of the 16S marker gene?
    ## here is the phylogeny-based approach that PEMA supports, using RAxML-ng and SILVA
    } else if ( paramsOfTable{'taxonomyAssignmentMethod'} == 'phylogeny' ) {

        ## we need a REFERENCE TREE, which we generated from the SILVA database with the aid of GAPPA;
        ## we used GAPPA in order to keep only 10,000 taxa from SILVA,
        ## and then RAxML-NG, which returned our REFERENCE TREE - you can find the corresponding script in $path/tools/silva_132/for_placement.
        ## to get our PHYLOGENY-based taxonomy assignment we make use of EPA-ng, the latest version of the Evolutionary Placement Algorithm

        # PaPaRa returns an MSA of our queries that takes into account the information in our reference tree
        task $path/tools/PaPaRa/papara_static_x86_64 -t $path/tools/createTreeTheEasyWay/raxml_easy_right_refmsa.raxml.bestTree -s $path/tools/createTreeTheEasyWay/raxml_easy_right_refmsa.raxml.reduced.phy -q $genePath/gene_16S/16S_all_samples.otus.fa -r
        wait
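        # Hedged note: this is the standard phylogenetic placement workflow - PaPaRa aligns the
        # query OTUs against the reference alignment/tree, and EPA-ng then places each query onto
        # the reference tree; the fixed GTR model parameters passed to EPA-ng below were presumably
        # estimated when the reference tree was built.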
        # we remove the reference MSA from PaPaRa's output - a workaround for an EPA-ng issue!
        # (the 1037 skipped lines are assumed to correspond to the reference alignment)
        sys tail -n +1038 $sixteenPath/papara_alignment.default > $sixteenPath/papara_alignment.phy
        wait

        # we convert our query MSA to FASTA format with a script of ours, because EPA-ng can handle only FASTA
        sixteenPath.chdir()
        task bash $path/scripts/convertPhylipToFasta.sh papara_alignment.phy
        wait

        # execute the EPA-ng algorithm (evolutionary placement) to get our phylogeny-based taxonomy assignment, at last!
        task $path/tools/EPA-ng/epa/bin/epa-ng -s $path/tools/createTreeTheEasyWay/raxml_easy_right_refmsa.raxml.reduced.phy.fasta -t $path/tools/createTreeTheEasyWay/raxml_easy_right_refmsa.raxml.bestTree -q $genePath/gene_16S/papara_alignment.fasta -w $genePath/gene_16S/ --model GTR{0.959447/2.52497/1.45842/1.06213/3.55143/1}+FU{0.237221/0.230211/0.294337/0.238231}+G4m{0.559838}
        wait
        print("phylogeny-based taxonomy assignment worked fine!")

        string assignFolder = "16S_taxon_assign_phylogeny_assignment"
        sixteenPath.chdir()
        assignFolder.mkdir()
        sys mv epa* $assignFolder
    }
}

####################################################################################################
#######################################      COI CASE      ########################################
####################################################################################################

### in the COI case, an IF statement selects between the two different clustering algorithms, and THEN the taxonomy assignment is done!

if ( paramsOfTable{'gene'} == 'gene_COI' ) {

    print 'my marker gene is COI' + "\n"

    ## in this case there is no algorithm that does both chimera removal and clustering, hence we have a 2-step procedure;
    ## although at first we had said chimera removal first and OTU clustering afterwards, we finally go the other way around

    ######################     CROP or SWARM     ##################################

    if ( paramsOfTable{'clusteringAlgo'} == 'algo_SWARM' ) {

        #### prepare files
        outputFilePath.chdir()
        string swarmPath = genePath + '/' + 'gene_COI' + '/' + 'SWARM'
        task cp final_all_samples.fasta $swarmPath/COI_SWARM_all_samples.fasta
        wait

        #####    SWARM    ####

        ###### we build a contingency table for the AMPLICONS - createContigencyTable.sh is a bash script found in the PEMA folder
        linearPath.chdir()
        task {
            sys bash $path/scripts/createContigencyTable.sh
            sys sed 's/linearized.dereplicate_//g; s/.merged.fastq//g' amplicon_contingency_table_temp.csv > amplicon_contingency_table.csv
            sys mv amplicon_contingency_table.csv $swarmPath
        }
        wait
        print 'the contingency table has been created successfully!' + "\n"

        ##### run SWARM and then chimera removal
        swarmPath.chdir()

        ## here is the clustering step with the SWARM algorithm
        if ( paramsOfTable{'d'} == '1' ) {
            task $path/tools/swarm/swarm/bin/swarm -d 1 -f -t 10 -s SWARM.stats -w SWARM_OTU_representatives_all_samples.fasta < COI_SWARM_all_samples.fasta > SWARM.swarms
        } else {
            task $path/tools/swarm/swarm/bin/swarm -d $paramsOfTable{'d'} -t 40 -s SWARM.stats -w SWARM_OTU_representatives_all_samples.fasta < COI_SWARM_all_samples.fasta > SWARM.swarms
        }
        wait
        print('the swarm algorithm worked just fine!' + "\n")
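        # Hedged note on the branch above: SWARM's fastidious option (-f) is only valid together
        # with d = 1 (it grafts low-abundance OTUs onto larger ones), which is presumably why the
        # d == 1 case gets its own invocation; for any other d the flag has to be dropped.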
        # and now we transform its output into the form we want:
        # turn lowercase to uppercase and substitute '_' with ';size='
        task {
            sys cat SWARM_OTU_representatives_all_samples.fasta | tr '[a-z]' '[A-Z]' > upper.SWARM_OTU_representatives_all_samples.fasta
            sys sed 's/_/;size=/g' upper.SWARM_OTU_representatives_all_samples.fasta > true.SWARM_OTU_representatives_all_samples.fasta
        }
        wait

        ## 'sortbysize' is an algorithm for sorting by size (abundance)
        task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --sortbysize true.SWARM_OTU_representatives_all_samples.fasta --output SWARM_final_OTU_representative.fasta
        wait
        print('the sortbysize algorithm ran OK!' + "\n")

        ## and 'uchime3_denovo' is an algorithm for finding chimeras
        task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --uchime3_denovo SWARM_final_OTU_representative.fasta --nonchimeras SWARM_otu_no_chimera.fasta --abskew $paramsOfTable{'abskew'}
        wait
        print('the uchime3_denovo algorithm for finding chimeras just finished!' + "\n")

        string[] swarmInterFiles = swarmPath.dir()
        for ( string file : swarmInterFiles ) {
            if ( file.startsWith('upper') ) {
                file.delete()
            }
            if ( file.startsWith('true') ) {
                file.delete()
            }
        }

        ##### at last, we create an OTU table out of the SWARM results; again, an awk command inside a bash script
        swarmPath.chdir()
        sys mv $linearPath/amplicon_contingency_table_temp.csv $swarmPath
        wait
        sys bash $path/scripts/createOtuContigencyTable.sh
        print('the contingency table with the OTUs found was just created!' + "\n")

    }    #### here ends the "if" statement of the SWARM clustering ( ATTENTION: KEEP 1 TAB HERE !! )

    ####################################   here is the other way of CLUSTERING in the case of COI   ####################################

    if ( paramsOfTable{'clusteringAlgo'} == 'algo_CROP' ) {

        string croPath = genePath + '/' + 'gene_COI' + '/' + 'CROP'

        ####    CROP    #####
        ## in this case we do chimera removal first and then apply the clustering algorithm ---------- until now (23/3) we think we prefer CROP over SWARM
        outputFilePath.chdir()
        task sed 's/_/;size=/g' final_all_samples.fasta > COI_CROP_all_samples.fasta
        wait
        sys cp COI_CROP_all_samples.fasta $croPath
        wait
        croPath.chdir()

        ### chimera removal
        task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --uchime3_denovo COI_CROP_all_samples.fasta --nonchimeras no_chim_COI_CROP_all_samples.fasta --abskew $paramsOfTable{'abskew'}
        wait

        ### calculate the parameters for the CROP clustering algorithm and run it
        croPath.chdir()
        sys bash $path/scripts/croParameters.sh
        wait
        print 'The clustering of the MOTUs by the CROP algorithm has been concluded!' + "\n"
    }
}

#######################################################################################################
##########    TAXONOMY ASSIGNMENT for the COI case using RDPClassifier (alignment-based)    ##########
#######################################################################################################

if ( paramsOfTable{'gene'} == 'gene_COI' && paramsOfTable{'clusteringAlgo'} == 'algo_SWARM' ) {

    ####    SWARM    #####
    string swarmPath = genePath + '/' + 'gene_COI' + '/' + 'SWARM'
    swarmPath.chdir()

    ### we remove all newlines and make each whole sequence one and only one line
    task awk 'NR==1 {print ; next} {printf /^>/ ? "\n"$0"\n" : $1} END {printf "\n"}' SWARM_otu_no_chimera.fasta > nospace_swarm.fasta
    wait
    print('nospace_swarm is fine!' + "\n")
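    # Hedged note: after the awk one-liner above, every record occupies exactly two lines
    # (header + sequence); the singleton filter right below relies on that layout, deleting a
    # matching 'size=1' header together with the sequence line that follows it.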
+"\n") ### I remove all singletons task sed '/size=1/,+1 d' nospace_swarm.fasta > no_singletons.fasta sys bash $path/scripts/removeSingletons.sh wait print('no_singletons file is fine' + "\n") ### I run RDPClassifier - train - step has already be done and as reference database we use MIDORI task java -Xmx64g -jar $path/tools/RDPTools/classifier.jar classify -t $path/tools/RDPTools/TRAIN/rRNAClassifier.properties -o tax_assign_swarm_COI_temp.txt no_singletons.fasta wait task{ sys sed 's/.*root\t1.0\t//g' tax_assign_swarm_COI_temp.txt > taxonomies.txt sys awk '{print $1}' tax_assign_swarm_COI_temp.txt > ids.txt sys paste ids.txt taxonomies.txt > tax_assign_swarm_COI.txt sys rm ids.txt taxonomies.txt tax_assign_swarm_COI_temp.txt } print 'rdpclassifier has ran as it should ' + "\n" swarmPath.chdir() string noSpaceFile = 'nospace_swarm.fasta' string noSingletonsFile = 'no_singletons.fasta' noSpaceFile.delete() # noSingletonsFile.delete() wait ## we keep in a new file, only the otus that have above 97% similartity with some species and in another just the uniq species swarmPath.chdir() #sys bash $path/scripts/keepOnlyPerCentSimilartityForSwarm.sh print 'all the taxonomy assignment procedure for the SWARM case has been completed!' + "\n" } ######################################### CROP case !!!! ################################################## wait if ( paramsOfTable{'gene'} == 'gene_COI' && paramsOfTable{'clusteringAlgo'} == 'algo_CROP' ) { string croPath = genePath + '/' + 'gene_COI' + '/' + 'CROP' croPath.chdir() print 'The taxonomy assignment with the CROP output has STARTED! ' ## intermidate file task { sys cat CROP_output.cluster.fasta | grep ">" > heads.txt sys sed 's/=[0-9]*/=/g' heads.txt > heads2.txt sys cat CROP_output.cluster.fasta | grep -v ">" > seqs.fasta sys awk '{print $2}' CROP_output.cluster > abundancies.txt sys tail -n +2 abundancies.txt > abundancies2.txt sys paste heads2.txt abundancies2.txt > new_heads.txt sys sed 's/\t//g' new_heads.txt > final_heads.txt sys paste -d'\n' final_heads.txt seqs.fasta > crop_for_rdpclassifier.fasta sys rm heads.txt heads2.txt seqs.fasta abundancies.txt abundancies2.txt new_heads.txt final_heads.txt } print 'I have made the file I need for the taxonomy assignment' + "\n" ## and now RDPClassifier can be performed croPath.chdir() task java -Xmx64g -jar $path/tools/RDPTools/classifier.jar classify -t $path/tools/RDPTools/TRAIN/rRNAClassifier.properties -o taxonomy_from_rdpclassifer_COI_CROP.txt crop_for_rdpclassifier.fasta croPath.chdir() sys bash $path/scripts/keepOnlyPerCentSimilartityForCrop.sh } ######################################################################################################################## ############################## here starts the 'makeOtuTable' script ... 
########################################################################################################################
##############################   here starts the 'makeOtuTable' script ... take as much information as possible   #####
########################################################################################################################

string fileFromRdpClassifier
string textFinal
string perSampleTriples

# set as variables the files that will be used and created, depending on the clustering algorithm
if ( paramsOfTable{'gene'} == 'gene_COI' && paramsOfTable{'clusteringAlgo'} == 'algo_SWARM' ) {
    string swarmPath = genePath + '/' + 'gene_COI' + '/' + 'SWARM'
    fileFromRdpClassifier = 'tax_assign_swarm_COI.txt'
    textFinal = 'finalTriplesFromSwarmClustering.txt'
    perSampleTriples = 'finalTriplesPerSampleFromSwarmClustering.txt'
    swarmPath.chdir()
} else if ( paramsOfTable{'gene'} == 'gene_COI' && paramsOfTable{'clusteringAlgo'} == 'algo_CROP' ) {
    string croPath = genePath + '/' + 'gene_COI' + '/' + 'CROP'
    fileFromRdpClassifier = 'taxonomy_from_rdpclassifer_COI_CROP.txt'
    textFinal = 'finalTriplesFromCropClustering.txt'
    perSampleTriples = 'finalTriplesPerSampleFromCropClustering.txt'
    croPath.chdir()
}
wait

# we check whether the output of the classifier is non-empty
if ( paramsOfTable{'gene'} == 'gene_COI' && fileFromRdpClassifier.size() != 0 ) {

    print ' we have an RDPClassifier output file to work with!!! '

    bool debugCode = false
    string{} giveSampleAndTaxonomyGetSizeSum
    string{} sampleIdentifiers    # set of all samples encountered
    string{} taxaNames            # set of all assigned taxa

    ##### from the output file of either clustering algorithm (SWARM or CROP) followed by the RDPClassifier, we get an assignment file from which we extract all the information we need;
    ##### here we keep only the taxonomy assigned at a given threshold and above (e.g. >97%) and merge all OTUs with the same taxonomy in the same sample into one entry;
    ##### finally, we sum up the sizes of the OTUs we merged

    string[] lines = fileFromRdpClassifier.readLines()
    for ( string line : lines ) {

        # an input line looks like (columns are tab-separated):
        #   ERR1308250:22628;size=4349  ""    root  root  1      Eukaryota  superkingdom  1      Arthropoda  phylum ...
        #   0                           1     2     3     4      5          6             7      8           9      ...
        #   PARSE                       SKIP  SKIP  SKIP  PARSE  PARSE      SKIP          PARSE  PARSE       SKIP   ...

        print line + "\n"
        string[] lineElements = line.split("\t")
        string sample = lineElements[0].split(':')[0]          # parse the 1st column (index 0), get its first component
        string sizeAsString = lineElements[0].split('=')[1]    # parse the 1st column (index 0), get its last component
        int size = sizeAsString.parseInt()
        if ( debugCode ) { print "TEST1\t" + sample + "\t" + size + "\n" }

        string thereIsMoreToRead = 'true'

        # the current triplet refers to e.g. the "Eukaryota superkingdom 1" columns
        int currentTripletStartColumn = 1    # start parsing from the 2nd column, i.e. the one with index 1
        string assigned_taxo = ""
        string assignmentConfidence = 0

        while ( thereIsMoreToRead == 'true' ) {

            assigned_taxo = lineElements[currentTripletStartColumn]    # read column with index 5, 8, 11...
            if ( debugCode ) { print "t1:" + currentTripletStartColumn + ":" + assigned_taxo + "\n" }

            # columns with index 6, 9, 12,... (the rank names) are skipped;
            # read the confidence column with index 7, 10, 13...
            assignmentConfidence = lineElements[currentTripletStartColumn + 2]
            print "the assigned taxo is " + assigned_taxo + "\n"
            print "assignmentConfidence is " + assignmentConfidence + "\n"
            print "size is equal to: " + size + "\n"
            if ( debugCode ) { print "t2:" + currentTripletStartColumn + ":" + assignmentConfidence + "\n" }

            # if no next taxon triplet follows, return
            int nextTripletConfidenceColumn = currentTripletStartColumn + 5
            if ( debugCode ) { print "t3:" + currentTripletStartColumn + " index out of max " + lineElements.size() - 1 + "\n" }

            # inspect the column with index 8, 11, 14... for existence
            if ( nextTripletConfidenceColumn > lineElements.size() ) {
                # the whole table has been read and the species level reached
                if ( debugCode ) { print "t4: column with index " + nextTripletConfidenceColumn + " does not exist\n" }
                thereIsMoreToRead = 'false'
                continue;
            }

            # if the confidence of the next taxon is below the threshold, return
            if ( lineElements[nextTripletConfidenceColumn] < 0.97 ) {
                if ( debugCode ) { print "t5:" + nextTripletConfidenceColumn + "\n" }
                thereIsMoreToRead = 'false'
                continue;
            }

            currentTripletStartColumn = currentTripletStartColumn + 3    # move to the next taxon triplet

        }    # end of the while loop that reads taxa and confidence levels

        if ( debugCode ) { print "TEST2\t" + sample + "\t" + assigned_taxo + "\t" + assignmentConfidence + "\t" + size + "\n" }

        #######################   up to here, the code finds the deepest taxon level   ####################################

        sampleIdentifiers{ sample } = 1;
        taxaNames{ assigned_taxo } = 1;
        wait

        # add this size to any previous size sum corresponding to the same (sample, taxonomy) pair
        if ( giveSampleAndTaxonomyGetSizeSum.hasKey(sample + '_SEP_' + assigned_taxo) == 'true' ) {
            int newSum = giveSampleAndTaxonomyGetSizeSum{ sample + '_SEP_' + assigned_taxo }.parseInt()
            print("size is " + size)
            newSum += size
            print("new sum is " + newSum)
            giveSampleAndTaxonomyGetSizeSum{ sample + '_SEP_' + assigned_taxo } = newSum
        } else {
            giveSampleAndTaxonomyGetSizeSum{ sample + '_SEP_' + assigned_taxo } = size
        }

        if ( debugCode ) { print "TEST3\t" + sample + '_' + assigned_taxo + "-" + giveSampleAndTaxonomyGetSizeSum{ sample + '_' + assigned_taxo } + "\n" }

    }    # end of each input file line
    wait

    if ( debugCode ) { print giveSampleAndTaxonomyGetSizeSum }

    # print the output based on the giveSampleAndTaxonomyGetSizeSum map
    string sampleToPrint
    string sizeSumToPrint
    string taxonomyToPrint
    wait

    for ( string sampleAndTaxon : giveSampleAndTaxonomyGetSizeSum.keys() ) {
        sampleToPrint = sampleAndTaxon.split('_SEP_')[0]
        sizeSumToPrint = giveSampleAndTaxonomyGetSizeSum{ sampleAndTaxon }
        taxonomyToPrint = sampleAndTaxon.split('_SEP_')[1]
        sys echo '$sampleToPrint $sizeSumToPrint $taxonomyToPrint ' >> $textFinal
        ######## NOTE: the issue (open for now) lies in the creation of textFinal, i.e. of finalTriplesFromSwarmClustering.txt, and NOT of the PER-SAMPLE file !!
    }

    if ( debugCode ) {
        print taxaNames
        print sampleIdentifiers
    }

    ## a double for-loop over all possible (sample id, taxon name) combinations, printing the real ones,
    ## i.e. the ones that actually have a size value
    string sizeToPrintPerSample
    for ( string sampleId : sampleIdentifiers.keys() ) {
        for ( string currentTaxon : taxaNames.keys() ) {
            ## print only for the (sample, taxonomy) pairs that have a size
            if ( giveSampleAndTaxonomyGetSizeSum.hasKey( sampleId + "_SEP_" + currentTaxon ) ) {
                sizeToPrintPerSample = giveSampleAndTaxonomyGetSizeSum{ sampleId + "_SEP_" + currentTaxon }
                sys echo '$sampleId $sizeToPrintPerSample $currentTaxon' >> $sampleId.$perSampleTriples
            }
        }
    }
    print 'just finished with the triples!' + "\n"
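    # Illustrative shape of the two outputs written above (sample name, count and taxon are made up):
    #   finalTriples*.txt                    ->  ERR1308250 4349 Arthropoda
    #   <sample>.finalTriplesPerSample*.txt  ->  one file per sample, holding the same
    #                                            'sample  summed-size  deepest-taxon' triples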
    wait

    # finally, make an OTU table only with the significant assignments (>97%)
    if ( paramsOfTable{'clusteringAlgo'} == 'algo_SWARM' ) {
        string swarmPath = genePath + '/' + 'gene_COI' + '/' + 'SWARM'
        swarmPath.chdir()
        sys bash $path/scripts/createOtuTableOnlyForTaxonomyAssigned.sh
    }
    if ( paramsOfTable{'clusteringAlgo'} == 'algo_CROP' ) {
        string cropPath = genePath + '/' + 'gene_COI' + '/' + 'CROP'
        cropPath.chdir()
        sys bash $path/scripts/createOtuTableOnlyForTaxonomyAssigned.sh
    }
}
print ' At last! We now have the MOTU-table! :) ' + "\n"
wait

###################################   here ends the 'makeOtuTable' script !!!   ###############################################

#////////////////////////////////////////////////////////////////////////////////////////////////////////

##############################################################################################################################
####   now we have everything we need, so we create per-sample files in which we keep only the info considered valuable   ####
##############################################################################################################################

bayesPath.chdir()
string[] names = bayesPath.dir()
outputPerSamplePath.chdir()

# for each sample we make a folder with its name, in order to put the per-sample PEMA output there
for ( string sample : names ) {
    sample.mkdir()
}

fastqcPath.chdir()

# in 'samples' we keep a list with all the file names found in the FastQC output folder - that means for all the files
string[] samples = fastqcPath.dir()

# we remove all .html files from our list
for ( string file : samples ) {
    if ( file.endsWith(".html") ) {
        samples.remove(file)
    }
}

# set variables
string unzipfq
string sampleNoZip
fastqcPath.chdir()

for ( string sample : samples ) {

    # we unzip the FastQC output for each file
    task unzip $fastqcPath/$sample
    sampleNoZip = sample.split('.zip')[0]
    unzipfq = sample.split('_[0-9]_fastqc')[0]
    wait

    # we copy two significant FastQC output files to the sample's output folder
    task {
        sys cp $fastqcPath/$sampleNoZip/Images/per_base_quality.png $outputPerSamplePath/$unzipfq
        sys cp $fastqcPath/$sampleNoZip/summary.txt $outputPerSamplePath/$unzipfq
    }
    wait
    unzipfq = ''
}
wait
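# Illustrative example of the name handling above (file name is hypothetical):
#   ERR1308250_1_fastqc.zip  ->  sampleNoZip = ERR1308250_1_fastqc  (the unzipped FastQC folder)
#                            ->  unzipfq     = ERR1308250           (the per-sample output folder)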
########################################################################################################
##########   making the per-sample output for the 16S marker gene & alignment-based assignment   ######
#########################################################################################################

genePath.chdir()
if ( paramsOfTable{'gene'} == 'gene_16S' && paramsOfTable{'taxonomyAssignmentMethod'} == 'alignment' ) {

    string sixteenPath = genePath + '/' + "gene_16S"
    string assignmentPath = sixteenPath + '/' + paramsOfTable{'taxonomyFolderName'}
    assignmentPath.chdir()

    task {
        sys sed "s/ /''/g" Richness.tsv > Richness2.tsv
        sys sed "s/ /''/g" All_Cumulative.tsv > All_Cumulative2.tsv
        sys sed "s/ /''/g" Relative_Abundance.tsv > Relative_Abundance2.tsv
    }
    wait

    for ( string sample : names ) {
        wait
        sys awk '{print $2, $3, $sample}' Richness2.tsv > Richness_$sample.tsv
        sys awk '{print $2, $3, $sample}' All_Cumulative2.tsv > All_Cumulative_$sample.tsv
        sys awk '{print $2, $3, $sample}' Relative_Abundance2.tsv > Relative_Abundance_$sample.tsv
        sys awk '{print $sample}' All_Cumulative2.tsv > All_Cumulative_only_int_$sample.tsv
        sys awk '{print $sample}' Relative_Abundance2.tsv > Relative_Abundance_only_int_$sample.tsv
        wait
        sys paste -d ' ' Richness_$sample.tsv All_Cumulative_only_int_$sample.tsv Relative_Abundance_only_int_$sample.tsv > profile_$sample.tsv
        wait
        sys rm All_Cumulative_only_int_$sample.tsv Relative_Abundance_only_int_$sample.tsv
        wait
        sys mv profile_$sample.tsv Richness_$sample.tsv All_Cumulative_$sample.tsv Relative_Abundance_$sample.tsv $outputPerSamplePath/$sample
    }
    sys rm Richness2.tsv All_Cumulative2.tsv Relative_Abundance2.tsv
}

###########################################################################################
##################   making the per-sample output for the COI marker gene   ##############
###########################################################################################

if ( paramsOfTable{'gene'} == 'gene_COI' && paramsOfTable{'clusteringAlgo'} == 'algo_SWARM' ) {

    # TODO: create the OTU table using string.readLines()
    string swarmPath = genePath + '/' + 'gene_COI' + '/' + 'SWARM'
    print swarmPath + "\n"
    swarmPath.chdir()

    string[] allSamplesList
    string[] allContainsOfSwarm = swarmPath.dir()
    for ( string sample : allContainsOfSwarm ) {
        print "sample is " + sample + "\n"
        if ( sample.endsWith('.finalTriplesPerSampleFromSwarmClustering.txt') == true ) {
            string sampleId = sample.split('.finalTriplesPerSampleFromSwarmClustering.txt')[0]
            print "sample is " + sampleId + "\n"
            sys mv $sample $outputPerSamplePath/$sampleId
        }
    }

    # here, the number of SPECIES found across all samples is computed
    sys bash $path/scripts/uniqSpecies.sh
}

if ( paramsOfTable{'gene'} == 'gene_COI' && paramsOfTable{'clusteringAlgo'} == 'algo_CROP' ) {

    print 'this is where our last little gap is.....' + "\n"
    string cropPath = genePath + '/' + 'gene_COI' + '/' + 'CROP'
    print cropPath + "\n"
    cropPath.chdir()

    string[] allSamplesList
    string[] allContainsOfCrop = cropPath.dir()
    for ( string sample : allContainsOfCrop ) {
        print "sample is " + sample + "\n"
        if ( sample.endsWith('.finalTriplesPerSampleFromCropClustering.txt') == true ) {
            string sampleId = sample.split('.finalTriplesPerSampleFromCropClustering.txt')[0]
            print "sample is " + sampleId + "\n"
            sys mv $sample $outputPerSamplePath/$sampleId
        }
    }
    # here, the number of SPECIES found across all samples is computed
    # TODO: create the OTU table using string.readLines()
}

checkpoint "phyloseq.chp"
parameterFilePath.chdir()
string m = "parameters.tsv"
string{} paramForPhyloseq = readParameterFile(m)

######################################################################################
#############################     phyloseq time!!!     ##############################
######################################################################################

# To run a phyloseq analysis, a phylogenetic tree of the OTUs found needs to be built.
# For that purpose, PEMA creates a Multiple Sequence Alignment (MSA) using the MAFFT algorithm;
# then the tree is built using RAxML.
# Finally, the user has to provide PEMA with a "metadata.csv" file. This needs to be in the analysis folder, along with parameters.tsv.
# Pay attention!
# You also need to fill in the variable with the SILVA version in the "phyloseq_in_PEMA.R" file, in order to get the phyloseq analysis.

if ( paramForPhyloseq{'gene'} == 'gene_16S' && paramForPhyloseq{'phyloseq'} == 'Yes' ) {

    outputFilePath.chdir()
    string phyloseq_folder = "phyloseq_output"
    string assignmentPath = genePath + '/' + "gene_16S" + '/' + paramForPhyloseq{'taxonomyFolderName'}

    ## if you want to use phyloseq functions that need a phylogenetic tree of the OTUs found, such a tree has to be built!
    if ( paramForPhyloseq{'tree'} == 'Yes' ) {

        assignmentPath.chdir()

        ## first, we align the sequences of the final OTUs using the MAFFT aligner
        task /usr/bin/mafft --auto --reorder --thread -1 Aligned_assignments.fasta > aligned_otus.fasta
        wait
        print "the MAFFT alignment has been concluded" + "\n"

        ## to get a proper tree, we first run the RAxML-ng "check" algorithm, in order to validate our MSA
        task $path/tools/raxml-ng/raxml-ng/bin/raxml-ng --check --msa aligned_otus.fasta --model GTR+G4 || true
        wait
        print " The check of the alignment is done!" + "\n"

        ## and then we create our tree using RAxML-ng; if the check reduced the MSA, we use the reduced PHYLIP file
        string test = 'aligned_otus.fasta.raxml.reduced.phy'
        if ( test.exists() == 'true' ) {
            task $path/tools/raxml-ng/raxml-ng/bin/raxml-ng --all --msa-format PHYLIP --msa aligned_otus.fasta.raxml.reduced.phy --tree pars{$paramForPhyloseq{'parsTrees'}} --bs-trees $paramForPhyloseq{'bootstrapTrees'} --model GTR+G4 --threads $paramForPhyloseq{'raxmlThreads'} --force
            wait
            sys mv aligned_otus.fasta.raxml.reduced.phy.raxml.bestTree aligned_otus.fasta.raxml.bestTree
        } else {
            print "the MAFFT alignment was good enough for raxml-ng" + "\n"
            task $path/tools/raxml-ng/raxml-ng/bin/raxml-ng --all --msa-format FASTA --msa aligned_otus.fasta --tree pars{$paramForPhyloseq{'parsTrees'}} --bs-trees $paramForPhyloseq{'bootstrapTrees'} --model GTR+G4 --threads $paramForPhyloseq{'raxmlThreads'} --force
            wait
        }
        wait
    }

    # check whether a phyloseq-output folder has already been made by a previous run
    if ( phyloseq_folder.isDir() ) {
        print "a phyloseq-output folder has already been built\n"
        wait
    } else {
        # create a folder where the input/output of phyloseq will be located
        phyloseq_folder.mkdir()
    }

    ## finally, we copy all the necessary PEMA output files, which phyloseq uses as input, to the phyloseq-output folder
    assignmentPath.chdir()
    task {
        ## add the OTU-table found to the phyloseq folder and name it as the phyloseq script expects
        sys cp 16S_otutab_$paramForPhyloseq{'taxonomyFolderName'}.txt $outputFilePath/phyloseq_output
        sys cd $outputFilePath/phyloseq_output
        sys mv 16S_otutab_$paramForPhyloseq{'taxonomyFolderName'}.txt 16S_otu_table.txt
        ## add the metadata file to the phyloseq folder
        sys cp $path/metadata.csv $outputFilePath/phyloseq_output
    }

    string test2 = "aligned_otus.fasta.raxml.bestTree"
    if ( test2.exists() == 'true' ) {
        task {
            sys cp aligned_otus.fasta.raxml.bestTree $outputFilePath/phyloseq_output
            sys cd $outputFilePath/phyloseq_output
            sys mv aligned_otus.fasta.raxml.bestTree OTUs_tree.tre
        }
    }

    # run the phyloseq script
    string phyloseqPath = outputFilePath + '/' + "phyloseq_output"
    phyloseqPath.chdir()
    sys cp $path/phyloseq_in_PEMA.R .
    wait
    sys /mnt/big/R-3.5.2/bin/Rscript phyloseq_in_PEMA.R
    wait
}

#######################################   phyloseq has been concluded   #######################################################

### make a folder into which all the checkpoints will be moved
parameterFilePath.chdir() ;
if ( paramForPhyloseq{'emptyCheckpoints'} == 'Yes' ) {
    string checkpointsDir = 'checkpoints_for_' + paramForPhyloseq{'outputFolderName'}
    outputFilePath.chdir()
    checkpointsDir.mkdir()
    #parameterFilePath.chdir()
    path.chdir()
    sys mv *.chp $outputFilePath/$checkpointsDir
}

# keep only one OTU-table per analysis in the case of the 16S marker gene and the alignment-based taxonomy assignment method
if ( paramForPhyloseq{'gene'} == '16S' && paramForPhyloseq{'taxonomyAssignmentMethod'} == 'alignment' ) {
    string assignmentPath = genePath + '/' + "gene_16S" + '/' + paramForPhyloseq{'taxonomyFolderName'}
    assignmentPath.chdir()
    sys cd
    sys mv 16S_otu_table.txt 16S_otutab_$paramForPhyloseq{'taxonomyFolderName'}.txt
}

outputFilePath.chdir()
sys mv 4.merged_by_SPAdes 4.merged_by_PANDAseq

print(" PEMA has come to an end! Let biology start! ")