#!/usr/bin/env bds

# This is PEMA! A metabarcoding pipeline for environmental DNA samples.
# Author: Haris Zafeiropoulos
# date: 2017-2018
# PEMA needs to be executed inside a container - Docker or Singularity.

#############################################################################################
#########       FUNCTION TO READ THE parameters.tsv FILE EACH TIME             #############
#############################################################################################

# first of all, read the parameters file and keep all parameters in a map (a map is the BDS dictionary type);
# to be able to restart runs from checkpoints, we use a function so that the parameters file can be re-read at any step
string{} readParameterFile(string parameter_file) {

    string parameters_file = parameter_file
    lines := parameters_file.readLines()
    string{} my_case

    for ( string line : lines ) {
        # we need a map with the parameters as keys and their values as values, hence we skip lines starting with "#"
        if ( line.startsWith("#") ) {
            continue
        } else {
            string pname, pvalue
            (pname, pvalue) = line.split('\t')    # tab - split
            my_case{ pname } = pvalue             # keep the 1st element as key (parameter) and the 2nd as its value (parameter's value)
            pname = ''
            pvalue = ''
        }
    }

    # return the map
    return( my_case )
}

###############################################################################################
################       set PEMA's environment so that it can be executed      ################
###############################################################################################

# read the parameters file for the first time!
string parameterFilePath = "/mnt/analysis/"
parameterFilePath.chdir()

# set a variable with the parameters file as its value
string f = "parameters.tsv"
string{} paramsFirstAssignment = readParameterFile(f)

# keep as a variable the path where PEMA is found
string path = "/home"

# keep as a variable the path where the data are found
string dataPath = "/mnt/analysis/mydata"

# keep as a variable the path of a BLAST database
string blastDatabasePath = "/mnt/databases"

# PEMA will also leave its output in /mnt
string outputPoint = "/mnt/analysis"
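# A minimal sketch of what parameters.tsv is assumed to look like: one tab-separated key/value
# pair per line, with '#' lines ignored by readParameterFile(). The values shown here are
# purely illustrative, not defaults:
#
#   # PEMA parameters
#   outputFolderName	my_first_analysis
#   gene	gene_16S
#   clusteringAlgo	algo_CROP
#   maxInfo	Yes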
# Quite an IMPORTANT step!
# If there are TrimLog files in the raw-data folder, PEMA does not run as it should; hence, we remove them if there are any!
# This happens when we re-run PEMA over the same dataset.
dataPath.chdir() ;

# check if there is a TrimLog file and IF YES, then delete it
sys if [ $(find "$dataPath" -name "TrimLog*") ] ; then rm TrimLog* ; fi

# create a folder where the pipeline's output will be saved
outputPoint.chdir()

# check whether an output folder with the name given in the parameters file has already been created by previous runs
if ( paramsFirstAssignment{'outputFolderName'}.isDir() ) {
    print "this output folder already exists\n"
} else {
    # if not, create a folder with that name
    paramsFirstAssignment{'outputFolderName'}.mkdir()
    print "a new output folder was just created!\n"
}

# keep the output folder name as a variable
string analysisName = paramsFirstAssignment{'outputFolderName'}

# here are, as variables, all the directories that will be used
string qualityControl = "1.quality_control" ;
string trimmomatic = "2.trimmomatic_output" ;
string bayesHammer = "3.correct_by_BayesHammer"
string spades = "4.merged_by_SPAdes" ;
string dereplicate = "5.dereplicate_by_obiuniq" ;
string linearized = "6.linearized_files" ;
string geneDependent = "7.gene_dependent"
string outputPerSample = "8.output_per_sample"

## and the paths for each of them
string outputFilePath = outputPoint + "/" + paramsFirstAssignment{'outputFolderName'}
string fastqcPath = outputFilePath + "/" + qualityControl ;
string trimoPath = outputFilePath + "/" + trimmomatic ;
string bayesPath = outputFilePath + "/" + bayesHammer ;
string spaPath = outputFilePath + "/" + spades ;
string derePath = outputFilePath + "/" + dereplicate ;
string linearPath = outputFilePath + "/" + linearized ;
string genePath = outputFilePath + "/" + geneDependent ;
string outputPerSamplePath = outputFilePath + "/" + outputPerSample

# and create an output folder for each pipeline step - if PEMA runs for the first time for a specific experiment
outputFilePath.chdir()

# make a list with all the directory names found in the output directory
string[] outputFiles = outputFilePath.dir()

# if that list is empty, then create the directories mentioned above
if ( outputFiles.isEmpty() == true ) {
    # create all the output folders
    qualityControl.mkdir() ;
    trimmomatic.mkdir() ;
    bayesHammer.mkdir() ;
    spades.mkdir() ;
    dereplicate.mkdir() ;
    linearized.mkdir() ;
    geneDependent.mkdir()
    outputPerSample.mkdir()
}

# gene-dependent directory
genePath.chdir()
paramsFirstAssignment{'gene'}.mkdir()

wait
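# For orientation, the output layout created above should look roughly like this
# (a sketch only; 4.merged_by_SPAdes is renamed to 4.merged_by_PANDAseq at the very end):
#
#   /mnt/analysis/<outputFolderName>/
#     1.quality_control
#     2.trimmomatic_output
#     3.correct_by_BayesHammer
#     4.merged_by_SPAdes
#     5.dereplicate_by_obiuniq
#     6.linearized_files
#     7.gene_dependent/<gene>
#     8.output_per_sample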
#///////////  from this point, PEMA and its tools start running  ///////////

##########################################################################################
##########################          1. FastQC          ##################################
##########################################################################################

# if there are TrimLog files in the raw-data folder, PEMA does not run as it should; hence, we remove them if there are any!
# we re-run this command because of FastQC and the way it behaves over multiple runs on the same folders
dataPath.chdir() ;
sys if [ $(find "$dataPath" -name "TrimLog*") ] ; then rm TrimLog* ; fi

# make a copy of the "parameters.tsv" file in the output folder;
# this way the user can retrieve the parameters used for an analysis at any time!
sys cp $parameterFilePath/parameters.tsv $outputFilePath/parameters0f.$analysisName.tsv

# now FastQC runs over all files
string[] rawFiles = dataPath.dir()
for ( string rawFile : rawFiles ) {
    print(rawFile)
    task $path/tools/fastqc/FastQC/fastqc --outdir $fastqcPath $dataPath/$rawFile
}
wait
print("FastQC is completed!" + "\n")

# now we create a checkpoint and read the parameters file again;
# this way, if any parameter is changed, only the steps from here on are executed with the new values
if ( qualityControl.isEmpty() == false ) {
    checkpoint "trimming.chp"
}
parameterFilePath.chdir()
string g = "parameters.tsv"
string{} paramsTrimmoStep = readParameterFile(g)

###########################################################################################
###########################         2. Trimmomatic         ###############################
###########################################################################################

dataPath.chdir() ;

# go to the folder that contains all the raw data and make a list (named 'data') with all the file names found there
string[] data = dataPath.dir()

# set two variables, readF and readR, for the forward and reverse file of each sample
string readF
string readR
int counter = 0

for ( string file : data ) {

    # here is the trick! we need to tell which file is the forward and which the reverse one, hence we try to split off a suffix.
    string check = file.split("_2\.fastq\.gz")[0]

    # if the split does occur, we know we have the second (reverse) file of the sample;
    # that is because the 'file' variable always has a '.fastq.gz' suffix,
    # so if there is no split, the 'file' and 'check' variables are the same
    if ( file == check ) {
        readF = file
        print("readF is: " + readF + "\n")
    # if there is a split, then they differ
    } else {
        readR = file
        print("readR is: " + readR + "\n")
    }

    wait

    if ( readF.isEmpty() == false && readR.isEmpty() == false ) {

        if ( paramsTrimmoStep{'maxInfo'} == 'Yes' ) {
            print("You have selected Max INFO" + "\n")
            wait
            task java -jar $path/tools/Trimmomatic/Trimmomatic-0.38/trimmomatic-0.38.jar PE -threads $paramsTrimmoStep{'threadsTrimmomatic'} -trimlog TrimLog $dataPath/$readF $dataPath/$readR $trimoPath/filtered_max_$readF.1P.fastq.gz $trimoPath/filtered_max_$readF.1U.fastq.gz $trimoPath/filtered_max_$readR.2P.fastq.gz $trimoPath/filtered_max_$readR.2U.fastq.gz ILLUMINACLIP:/$path/tools/Trimmomatic-0.38/adapters/$paramsTrimmoStep{'adapters'}:$paramsTrimmoStep{'seedMismatches'}:$paramsTrimmoStep{'palindromeClipThreshold'}:$paramsTrimmoStep{'simpleClipThreshold'} LEADING:$paramsTrimmoStep{'leading'} TRAILING:$paramsTrimmoStep{'trailing'} MAXINFO:$paramsTrimmoStep{'targetLength'}:$paramsTrimmoStep{'strictness'} MINLEN:$paramsTrimmoStep{'minlen'}
            wait
            readF = ''
            readR = ''

        } else if ( paramsTrimmoStep{'maxInfo'} == 'No' ) {
            print("You have selected no Max INFO" + "\n")
            task java -jar $path/tools/Trimmomatic/Trimmomatic-0.38/trimmomatic-0.38.jar PE -threads $paramsTrimmoStep{'threadsTrimmomatic'} -trimlog TrimLog2 $dataPath/$readF $dataPath/$readR $trimoPath/filtered_$readF.1P.fastq.gz $trimoPath/filtered_$readF.1U.fastq.gz $trimoPath/filtered_$readR.2P.fastq.gz $trimoPath/filtered_$readR.2U.fastq.gz ILLUMINACLIP:/$path/tools/Trimmomatic-0.38/adapters/$paramsTrimmoStep{'adapters'}:$paramsTrimmoStep{'seedMismatches'}:$paramsTrimmoStep{'palindromeClipThreshold'}:$paramsTrimmoStep{'simpleClipThreshold'} LEADING:$paramsTrimmoStep{'leading'} TRAILING:$paramsTrimmoStep{'trailing'} MINLEN:$paramsTrimmoStep{'minlen'}
            wait
            readF = ''
            readR = ''
        }
    }
}
wait

print "Trimmomatic is done\n\n" ;
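# Illustrative recap of the forward/reverse pairing trick used above (sample name is hypothetical):
#   ERR1308250_1.fastq.gz  ->  no "_2.fastq.gz" suffix to split off  ->  readF
#   ERR1308250_2.fastq.gz  ->  the split occurs                      ->  readR
# once both readF and readR are set, Trimmomatic runs on that pair and both variables are reset.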
# let us make a CHECKPOINT from which we will be able to restart our analysis at the error-correction step
path.chdir() ;
trimoPath.chdir() ;
if ( trimoPath.isEmpty() == false ) {
    checkpoint "errorCorrection.chp"
}
parameterFilePath.chdir()
string h = "parameters.tsv"
string{} paramsBayesHammerStep = readParameterFile(h)

################################################################################################
##################     3. BayesHammer - error correction // SPAdes     ########################
################################################################################################

# make a list with all the file names found in the folder with Trimmomatic's
# output -- e.g. suffix: '2.fastq.gz.2U.fastq.gz'
trimoPath.chdir()
string[] trimos = trimoPath.dir()

for ( string file : trimos ) {
    # we delete all the 'U.fastq.gz' (unpaired) files using the list we just made
    if ( file.endsWith("U\.fastq\.gz") ) {
        file.delete()
    }
}

# and then we make a new list with the files that remained
string[] finalTrimos = trimoPath.dir()
string readBayesF
string readBayesR
string newDir
string newFile
wait

for ( string trimmed : finalTrimos ) {

    # as in the Trimmomatic step, we need to know which is the forward and which the reverse file
    string trimCheck = trimmed.split("_2\.fastq\.gz\.2P\.fastq\.gz")[0]
    if ( trimmed == trimCheck ) {
        readBayesF = trimmed
    } else {
        readBayesR = trimmed
    }

    if ( readBayesF.isEmpty() == false && readBayesR.isEmpty() == false ) {

        if ( paramsBayesHammerStep{'maxInfo'} == 'Yes' ) {
            string newName = trimCheck.split("filtered\_max\_")
            newDir = newName.substr(3,13) ;
            print("trimCheck: " + trimCheck + "\n" + "newDir: " + newDir + "\n")
        } else if ( paramsBayesHammerStep{'maxInfo'} == 'No' ) {
            string newName = trimCheck.split("filtered_")
            newDir = newName.substr(3,13) ;
            print("trimCheck: " + trimCheck + "\n" + "newDir: " + newDir + "\n")
        }

        # work where the output of BayesHammer is located!
        bayesPath.chdir() ;
        newDir.mkdir() ;

        # go back to Trimmomatic's output
        trimoPath.chdir()
        wait

        # and execute the BayesHammer algorithm that ships with SPAdes
        sys which gzip
        sys gzip --version
        print("readBayesF: " + readBayesF + "\n" + "and readBayesR: " + readBayesR + "\n")
        task $path/tools/SPAdes/SPAdes-3.13.0-Linux/bin/spades.py --only-error-correction -o $bayesPath/$newDir -1 $readBayesF -2 $readBayesR --disable-gzip-output
        wait

        # clear all variables used in the BayesHammer step so they can be re-used
        readBayesF = ''
        readBayesR = ''
        newFile = ''
        newDir = ''
    }
}
wait
print("Error correction using BayesHammer is completed!" + "\n")

# let us make a checkpoint so the pipeline can restart at the merging step!
if ( bayesPath.isEmpty() == false ) {
    checkpoint "mergingWithSpades.chp"
}
parameterFilePath.chdir()
string i = "parameters.tsv"
string{} paramsSpadesMerging = readParameterFile(i)

#############################################################################################
############################   4. PANDAseq - merging/overlapping   #########################
#############################################################################################

bayesPath.chdir()

# keep in a list all files that came from the BayesHammer algorithm, leaving aside some stats files;
# i.e. make a list with all the folders in the BayesHammer output folder - only sample names (e.g. SRR1643855)
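# Sketch of the per-sample layout expected by the merging step below (illustrative, for a
# sample named SRR1643855 with maxInfo = Yes):
#
#   3.correct_by_BayesHammer/SRR1643855/corrected/filtered_max_SRR1643855_1.fastq.gz.1P.fastq.00.0_0.cor.fastq
#   3.correct_by_BayesHammer/SRR1643855/corrected/filtered_max_SRR1643855_2.fastq.gz.2P.fastq.00.0_0.cor.fastq
#
# (the files are gzipped in the loop below, which yields the '.cor.fastq.gz' names assembled there)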
string[] correct = bayesPath.dir()
correct.remove("input_dataset.yaml") ;
correct.remove("params.txt") ;
correct.remove("spades.log") ;
correct.remove("tmp")

# we need to "create" the names of the forward and reverse files each time;
# we keep as 'baseName' the prefix of every file, which depends on whether maxInfo was used or not
string baseName
if ( paramsSpadesMerging{'maxInfo'} == 'Yes' ) {
    baseName = "filtered_max_"
} else {
    baseName = "filtered_"
}

for ( string correctFile : correct ) {

    string uniqPath = bayesPath + '/' + correctFile + '/' + 'corrected' ;

    # each time, we enter the directory of a specific sample inside the output folder of BayesHammer - the previous step
    uniqPath.chdir() ;
    sys /bin/gzip *.fastq

    # in the 'corrected' folder of each sample there are two files - one forward, one reverse -
    # and we build two variables with their names
    string forward = "_1.fastq.gz.1P.fastq.00.0_0.cor.fastq.gz"
    string reverse = "_2.fastq.gz.2P.fastq.00.0_0.cor.fastq.gz"
    string forwardFile = baseName + correctFile + forward ;
    print(forwardFile + "\n")
    string reverseFile = baseName + correctFile + reverse ;
    print(reverseFile + "\n")
    wait

    # if the sample names are not Illumina-style and PANDAseq has to be used,
    # the sample names in the raw data have to be changed to e.g. "ERR0001"
    if ( paramsSpadesMerging{'EnaData'} == 'No' ) {
        uniqPath.chdir()
        sys bash $path/scripts/prepareForPandaseq.sh $forwardFile $reverseFile
    }

    # finally, we execute the PANDAseq algorithm for merging
    task $path/tools/PANDAseq/bin/pandaseq -f $forwardFile -r $reverseFile -6 -A $paramsSpadesMerging{'pandaseqAlgorithm'} -a -B -T $paramsSpadesMerging{'pandaseqThreads'} > $correctFile.merged.fastq.gz
    wait

    if ( paramsSpadesMerging{'EnaData'} == 'No' ) {
        string mergedFile = correctFile + ".merged.fastq.gz"
        sys $path/scripts/recoverSampleNames.sh $mergedFile
    }

    sys mv *.merged.fastq.gz $spaPath
    bayesPath.chdir()
}
wait

# now that merging has run for all samples, we move all merged files into a new folder named "unzip"
# and strip their '.gz' suffix; we go to the folder with the merged output and make a list with its elements
spaPath.chdir() ;
string[] merged = spaPath.dir() ;
string unzip = "unzip" ;

# we make a new directory named 'unzip'
unzip.mkdir() ;
string unzipPath = "$spaPath/unzip" ;

# and we remove the element 'unzip' from the list we just made
merged.remove("unzip")

for ( string anyFile : merged ) {
    sys cp $anyFile $unzipPath
    unzipPath.chdir() ;
    string newFile = anyFile.baseName(".gz")
    sys mv $anyFile $newFile
    spaPath.chdir()
}
wait
print("Merging step by PANDAseq is completed" + "\n")

# we make a checkpoint so the pipeline can restart at the dereplication step!
if ( spaPath.isEmpty() == false ) {
    checkpoint "dereplicationStep.chp"
}
parameterFilePath.chdir()
string j = "parameters.tsv"
string{} paramsDereplication = readParameterFile(j)

##########################################################################################################
#########################     5. Dereplication with the help of OBITools!     ###########################
##########################################################################################################

unzipPath.chdir() ;
string[] unzips = unzipPath.dir()
string fileForDerepl

for ( string nodereplicate : unzips ) {
    # now we execute OBITools' obiuniq
    task $path/tools/OBITools-1.2.12/bin/obiuniq -m sample $nodereplicate > $derePath/dereplicate_$nodereplicate ;
    wait
}
wait
print("all the first steps are done! clustering is about to start!" + "\n")
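# Hedged note: the checkpoints written above ("trimming.chp", "errorCorrection.chp",
# "mergingWithSpades.chp", "dereplicationStep.chp") are meant to let a run be resumed without
# redoing earlier steps. Assuming the standard BigDataScript restore flag, resuming would look like:
#
#   bds -r dereplicationStep.chp
#
# after (optionally) editing parameters.tsv - the parameters file is re-read right after each checkpoint.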
+"\n") checkpoint "tableStep.chp" parameterFilePath.chdir() string k = "parameters.tsv" string{} paramsOfTable = readParameterFile(k) ############################################################################################################## ######################################### MERGING ALL FILES IN (1) ONE ############################ ############################################################################################################# derePath.chdir() # we make a list with all the file names that exist in the folder with the dereplication-step output string[] dereplicates = derePath.dir() for ( string derepl : dereplicates ) { # and we make some transforms in all these files task { # we substitue both 'merged_sample...' amd ';' with '_' and nothing respectively sys sed 's/ merged_sample={}; count=/_/g ; s/;//g' $derepl > se.$derepl # merge all lines of a fastq entry into one and only one line! sys awk 'NR==1 {print ; next} {printf /^>/ ? "\n"$0"\n" : $1} END {printf "\n"}' se.$derepl > nospace.$derepl # we turn all down- to upcases sys cat nospace.$derepl | tr '[a-z]' '[A-Z]' > $linearPath/linearized.$derepl } } wait derePath.chdir() sys rm se.* nospace.* # we go to the folder with the linearized data and make a list with all its contents linearPath.chdir() string[] linears = linearPath.dir() # we remove from these files some characteristic symbols for ( string sample : linears ) { task{ sys sed 's/\:\:\:/\:/g' $sample > $sample.f.fasta sys sed 's/\:0\:0\:0\://' $sample.f.fasta > $sample.g.fasta } } wait linearPath.chdir() sys rm *.f.fasta wait # this command forces applications to use the default language for output, and forces sorting to be bytewise. sys export LC_ALL=C # here I read all the files I have made at once and run over them an awk command in which I set as record separator ">" and as field separaton '_' or new line. # from the second line I make an array with the sample as keys and their abundances as value and anotherone with the otus as keys and the sequnece asw value. # finally, I make a file in which in every line i have the amplicon name (sample) its abundance and its sequence. 
task cat *.g.fasta | awk 'BEGIN {RS = ">" ; FS = "[_\n]"} {if (NR != 1) {abundances[$1] += $2 ; sequences[$1] = $3}} END {for (amplicon in sequences) {print ">" amplicon "_" abundances[amplicon] "_" sequences[amplicon]}}' | sort --temporary-directory=$(pwd) -t "_" -k2,2nr -k1.2,1d | sed -e 's/\_/\n/2' > $outputFilePath/all_samples.fasta
wait

# drop, from all_samples.fasta, the two-line window starting at any line that contains an 'N'
task awk '/N/{n=2}; n {n--; next}; 1' < $outputFilePath/all_samples.fasta > $outputFilePath/final_all_samples.fasta
wait
sys rm *.g.fasta

######################################################################################################################################
## the file 'final_all_samples.fasta' now holds all the reads from all our samples, so we can move on to the CLUSTERING step;
## however, even though the procedure so far was the same regardless of the marker gene,
## we now have to choose between different paths depending on the gene and our needs
######################################################################################################################################

# depending on the marker gene and (for the COI case) the clustering algorithm, we create the right folders
genePath.chdir()
if ( paramsOfTable{'gene'} == 'gene_COI' ) {
    string coiPath = genePath + '/' + 'gene_COI'
    coiPath.chdir()
    if ( paramsOfTable{'clusteringAlgo'} == 'algo_CROP' ) {
        string algo = 'CROP'
        algo.mkdir()
    } else if ( paramsOfTable{'clusteringAlgo'} == 'algo_SWARM' ) {
        string algo = 'SWARM'
        algo.mkdir()
    }
}

###################################################################################################################
##################################################   16S CASE   ##################################################
###################################################################################################################

if ( paramsOfTable{'gene'} == 'gene_16S' ) {

    print " my marker gene is 16S\n "

    ## chimera removal & OTU clustering with VSEARCH - this workflow performs BOTH chimera removal and clustering
    string sixteenPath = genePath + '/' + "gene_16S"
    outputFilePath.chdir()
    task sed 's/_/;size=/g' final_all_samples.fasta > $sixteenPath/16S_all_samples.fasta
    wait

    ######################   here we execute the VSEARCH algorithms   #############################
    sixteenPath.chdir()

    # full-length dereplication
    task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --threads $paramsOfTable{'vsearchThreads'} --derep_fulllength 16S_all_samples.fasta --minuniquesize 2 --sizein --sizeout --fasta_width 0 --uc all.derep.uc --output all.derep.fasta
    wait

    # execute the first clustering step
    task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --threads $paramsOfTable{'vsearchThreads'} --cluster_size all.derep.fasta --id $paramsOfTable{'vsearchId'} --strand both --sizein --sizeout --fasta_width 0 --uc all.preclustered.uc --centroids all.preclustered.fasta
    wait

    # chimera removal
    task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --threads $paramsOfTable{'vsearchThreads'} --uchime_denovo all.preclustered.fasta --sizein --sizeout --fasta_width 0 --nonchimeras all.denovo.nonchimeras.fasta
    wait

    # rename an output file
    sys mv all.denovo.nonchimeras.fasta all.ref.nonchimeras.fasta

    # here we use two Perl scripts located in the "scripts" folder
    task $path/scripts/map_vsearch.pl all.derep.fasta all.preclustered.uc all.ref.nonchimeras.fasta > all.nonchimeras.derep.fasta
    wait
    task $path/scripts/map_vsearch.pl 16S_all_samples.fasta all.derep.uc all.nonchimeras.derep.fasta > all.nonchimeras.fasta
    wait
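    # Hedged note: the two map_vsearch.pl calls above mirror the classic VSEARCH chimera-checking
    # recipe - they map the non-chimeric centroids back, first onto the dereplicated set and then
    # onto the full per-sample amplicon set, so that the final clustering below runs on all reads
    # whose centroid survived chimera removal.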
    # execute the second clustering step
    task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --threads $paramsOfTable{'vsearchThreads'} --cluster_size all.nonchimeras.fasta --id 0.97 --strand both --sizein --fasta_width 0 --uc all.clustered.uc --relabel Otu --centroids all.otus.fasta --otutabout all.otutab.txt
    wait
    sys mv all.otus.fasta 16S_all_samples.otus.fa
    sys mv all.otutab.txt 16S_otutab.txt
    sys cp 16S_otutab.txt 16S_otutab_$paramsOfTable{'taxonomyFolderName'}.txt

    #######  finally, we execute the TAXONOMY ASSIGNMENT step using CREST and the SILVA database (alignment-based)  ###########
    if ( paramsOfTable{'taxonomyAssignmentMethod'} == 'alignment' ) {

        sixteenPath.chdir()
        if ( paramsOfTable{'silvaVersion'} == 'silva_128' ) {
            # first we use the megablast algorithm of blastn for sequence identification, intra-species comparison
            task $path/tools/ncbi-blast-2.8.1+/bin/blastn -task megablast -query 16S_all_samples.otus.fa -db $path/tools/CREST/LCAClassifier/parts/flatdb/silvamod/silvamod128.fasta -num_alignments 100 -outfmt 5 -out 16S_mysilvamod128_$paramsOfTable{'taxonomyFolderName'}.xml -num_threads 10
            wait
            # and then we use the classifier to get our assignment
            task $path/tools/CREST/LCAClassifier/bin/classify -t 16S_otutab_$paramsOfTable{'taxonomyFolderName'}.txt -o $paramsOfTable{'taxonomyFolderName'} 16S_mysilvamod128_$paramsOfTable{'taxonomyFolderName'}.xml

        # we follow the same steps in the case of SILVA 132
        } else if ( paramsOfTable{'silvaVersion'} == 'silva_132' ) {
            task $path/tools/ncbi-blast-2.8.1+/bin/blastn -task megablast -query 16S_all_samples.otus.fa -db $path/tools/CREST/LCAClassifier/parts/flatdb/silva132/silva132 -num_alignments 100 -outfmt 5 -out 16S_mysilvamod132_$paramsOfTable{'taxonomyFolderName'}.xml -num_threads 10
            wait
            task $path/tools/CREST/LCAClassifier/bin/classify -c $path/tools/CREST/LCAClassifier/parts/etc/lcaclassifier.conf -d silva132 -t 16S_otutab_$paramsOfTable{'taxonomyFolderName'}.txt -o $paramsOfTable{'taxonomyFolderName'} 16S_mysilvamod132_$paramsOfTable{'taxonomyFolderName'}.xml
            wait
        }

    ## what about another way to assign taxonomy in the case of the 16S marker gene?
    ## here is the phylogeny-based approach that PEMA supports, using RAxML-ng and SILVA
    } else if ( paramsOfTable{'taxonomyAssignmentMethod'} == 'phylogeny' ) {

        ## we need a REFERENCE TREE, which we generated from the SILVA database with the aid of GAPPA;
        ## we used GAPPA in order to keep only 10,000 taxa from SILVA,
        ## and then RAxML-NG, which returned our REFERENCE TREE - you can find the corresponding script in $path/tools/silva_132/for_placement.
        ## to get our PHYLOGENY-based taxonomy assignment we make use of EPA-ng, the latest version of the Evolutionary Placement Algorithm

        # PaPaRa returns an MSA of our queries that takes into account the information in our reference tree
        task $path/tools/PaPaRa/papara_static_x86_64 -t $path/tools/createTreeTheEasyWay/raxml_easy_right_refmsa.raxml.bestTree -s $path/tools/createTreeTheEasyWay/raxml_easy_right_refmsa.raxml.reduced.phy -q $genePath/gene_16S/16S_all_samples.otus.fa -r
        wait
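        # Hedged note: this is the standard phylogenetic placement workflow - PaPaRa aligns the
        # query OTUs against the reference alignment/tree, and EPA-ng then places each query onto
        # the reference tree; the fixed GTR model parameters passed to EPA-ng below were presumably
        # estimated when the reference tree was built.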
        # we remove the reference MSA from PaPaRa's output - a workaround for an EPA-ng issue!
        # (the 1037 skipped lines are assumed to correspond to the reference alignment)
        sys tail -n +1038 $sixteenPath/papara_alignment.default > $sixteenPath/papara_alignment.phy
        wait

        # we convert our query MSA to FASTA format with a script of ours, because EPA-ng can handle only FASTA
        sixteenPath.chdir()
        task bash $path/scripts/convertPhylipToFasta.sh papara_alignment.phy
        wait

        # execute the EPA-ng algorithm (evolutionary placement) to get our phylogeny-based taxonomy assignment, at last!
        task $path/tools/EPA-ng/epa/bin/epa-ng -s $path/tools/createTreeTheEasyWay/raxml_easy_right_refmsa.raxml.reduced.phy.fasta -t $path/tools/createTreeTheEasyWay/raxml_easy_right_refmsa.raxml.bestTree -q $genePath/gene_16S/papara_alignment.fasta -w $genePath/gene_16S/ --model GTR{0.959447/2.52497/1.45842/1.06213/3.55143/1}+FU{0.237221/0.230211/0.294337/0.238231}+G4m{0.559838}
        wait
        print("phylogeny-based taxonomy assignment worked fine!")

        string assignFolder = "16S_taxon_assign_phylogeny_assignment"
        sixteenPath.chdir()
        assignFolder.mkdir()
        sys mv epa* $assignFolder
    }
}

####################################################################################################
#######################################      COI CASE      ########################################
####################################################################################################

### in the COI case, an IF statement selects between the two different clustering algorithms, and THEN the taxonomy assignment is done!

if ( paramsOfTable{'gene'} == 'gene_COI' ) {

    print 'my marker gene is COI' + "\n"

    ## in this case there is no algorithm that does both chimera removal and clustering, hence we have a 2-step procedure;
    ## although at first we had said chimera removal first and OTU clustering afterwards, we finally go the other way around

    ######################     CROP or SWARM     ##################################

    if ( paramsOfTable{'clusteringAlgo'} == 'algo_SWARM' ) {

        #### prepare files
        outputFilePath.chdir()
        string swarmPath = genePath + '/' + 'gene_COI' + '/' + 'SWARM'
        task cp final_all_samples.fasta $swarmPath/COI_SWARM_all_samples.fasta
        wait

        #####    SWARM    ####

        ###### we build a contingency table for the AMPLICONS - createContigencyTable.sh is a bash script found in the PEMA folder
        linearPath.chdir()
        task {
            sys bash $path/scripts/createContigencyTable.sh
            sys sed 's/linearized.dereplicate_//g; s/.merged.fastq//g' amplicon_contingency_table_temp.csv > amplicon_contingency_table.csv
            sys mv amplicon_contingency_table.csv $swarmPath
        }
        wait
        print 'the contingency table has been created successfully!' + "\n"

        ##### run SWARM and then chimera removal
        swarmPath.chdir()

        ## here is the clustering step with the SWARM algorithm
        if ( paramsOfTable{'d'} == '1' ) {
            task $path/tools/swarm/swarm/bin/swarm -d 1 -f -t 10 -s SWARM.stats -w SWARM_OTU_representatives_all_samples.fasta < COI_SWARM_all_samples.fasta > SWARM.swarms
        } else {
            task $path/tools/swarm/swarm/bin/swarm -d $paramsOfTable{'d'} -t 40 -s SWARM.stats -w SWARM_OTU_representatives_all_samples.fasta < COI_SWARM_all_samples.fasta > SWARM.swarms
        }
        wait
        print('the swarm algorithm worked just fine!' + "\n")
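        # Hedged note on the branch above: SWARM's fastidious option (-f) is only valid together
        # with d = 1 (it grafts low-abundance OTUs onto larger ones), which is presumably why the
        # d == 1 case gets its own invocation; for any other d the flag has to be dropped.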
        # and now we transform its output into the form we want:
        # turn lowercase to uppercase and substitute '_' with ';size='
        task {
            sys cat SWARM_OTU_representatives_all_samples.fasta | tr '[a-z]' '[A-Z]' > upper.SWARM_OTU_representatives_all_samples.fasta
            sys sed 's/_/;size=/g' upper.SWARM_OTU_representatives_all_samples.fasta > true.SWARM_OTU_representatives_all_samples.fasta
        }
        wait

        ## 'sortbysize' is an algorithm for sorting by size (abundance)
        task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --sortbysize true.SWARM_OTU_representatives_all_samples.fasta --output SWARM_final_OTU_representative.fasta
        wait
        print('the sortbysize algorithm ran OK!' + "\n")

        ## and 'uchime3_denovo' is an algorithm for finding chimeras
        task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --uchime3_denovo SWARM_final_OTU_representative.fasta --nonchimeras SWARM_otu_no_chimera.fasta --abskew $paramsOfTable{'abskew'}
        wait
        print('the uchime3_denovo algorithm for finding chimeras just finished!' + "\n")

        string[] swarmInterFiles = swarmPath.dir()
        for ( string file : swarmInterFiles ) {
            if ( file.startsWith('upper') ) {
                file.delete()
            }
            if ( file.startsWith('true') ) {
                file.delete()
            }
        }

        ##### at last, we create an OTU table out of the SWARM results; again, an awk command inside a bash script
        swarmPath.chdir()
        sys mv $linearPath/amplicon_contingency_table_temp.csv $swarmPath
        wait
        sys bash $path/scripts/createOtuContigencyTable.sh
        print('the contingency table with the OTUs found was just created!' + "\n")

    }    #### here ends the "if" statement of the SWARM clustering ( ATTENTION: KEEP 1 TAB HERE !! )

    ####################################   here is the other way of CLUSTERING in the case of COI   ####################################

    if ( paramsOfTable{'clusteringAlgo'} == 'algo_CROP' ) {

        string croPath = genePath + '/' + 'gene_COI' + '/' + 'CROP'

        ####    CROP    #####
        ## in this case we do chimera removal first and then apply the clustering algorithm ---------- until now (23/3) we think we prefer CROP over SWARM
        outputFilePath.chdir()
        task sed 's/_/;size=/g' final_all_samples.fasta > COI_CROP_all_samples.fasta
        wait
        sys cp COI_CROP_all_samples.fasta $croPath
        wait
        croPath.chdir()

        ### chimera removal
        task $path/tools/VSEARCH/vsearch-2.9.1-linux-x86_64/bin/vsearch --uchime3_denovo COI_CROP_all_samples.fasta --nonchimeras no_chim_COI_CROP_all_samples.fasta --abskew $paramsOfTable{'abskew'}
        wait

        ### calculate the parameters for the CROP clustering algorithm and run it
        croPath.chdir()
        sys bash $path/scripts/croParameters.sh
        wait
        print 'The clustering of the MOTUs by the CROP algorithm has been concluded!' + "\n"
    }
}

#######################################################################################################
##########    TAXONOMY ASSIGNMENT for the COI case using RDPClassifier (alignment-based)    ##########
#######################################################################################################

if ( paramsOfTable{'gene'} == 'gene_COI' && paramsOfTable{'clusteringAlgo'} == 'algo_SWARM' ) {

    ####    SWARM    #####
    string swarmPath = genePath + '/' + 'gene_COI' + '/' + 'SWARM'
    swarmPath.chdir()

    ### we remove all newlines and make each whole sequence one and only one line
    task awk 'NR==1 {print ; next} {printf /^>/ ? "\n"$0"\n" : $1} END {printf "\n"}' SWARM_otu_no_chimera.fasta > nospace_swarm.fasta
    wait
    print('nospace_swarm is fine!' + "\n")
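    # Hedged note: after the awk one-liner above, every record occupies exactly two lines
    # (header + sequence); the singleton filter right below relies on that layout, deleting a
    # matching 'size=1' header together with the sequence line that follows it.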
+"\n") ### I remove all singletons task sed '/size=1/,+1 d' nospace_swarm.fasta > no_singletons.fasta sys bash $path/scripts/removeSingletons.sh wait print('no_singletons file is fine' + "\n") ### I run RDPClassifier - train - step has already be done and as reference database we use MIDORI task java -Xmx64g -jar $path/tools/RDPTools/classifier.jar classify -t $path/tools/RDPTools/TRAIN/rRNAClassifier.properties -o tax_assign_swarm_COI_temp.txt no_singletons.fasta wait task{ sys sed 's/.*root\t1.0\t//g' tax_assign_swarm_COI_temp.txt > taxonomies.txt sys awk '{print $1}' tax_assign_swarm_COI_temp.txt > ids.txt sys paste ids.txt taxonomies.txt > tax_assign_swarm_COI.txt sys rm ids.txt taxonomies.txt tax_assign_swarm_COI_temp.txt } print 'rdpclassifier has ran as it should ' + "\n" swarmPath.chdir() string noSpaceFile = 'nospace_swarm.fasta' string noSingletonsFile = 'no_singletons.fasta' noSpaceFile.delete() # noSingletonsFile.delete() wait ## we keep in a new file, only the otus that have above 97% similartity with some species and in another just the uniq species swarmPath.chdir() #sys bash $path/scripts/keepOnlyPerCentSimilartityForSwarm.sh print 'all the taxonomy assignment procedure for the SWARM case has been completed!' + "\n" } ######################################### CROP case !!!! ################################################## wait if ( paramsOfTable{'gene'} == 'gene_COI' && paramsOfTable{'clusteringAlgo'} == 'algo_CROP' ) { string croPath = genePath + '/' + 'gene_COI' + '/' + 'CROP' croPath.chdir() print 'The taxonomy assignment with the CROP output has STARTED! ' ## intermidate file task { sys cat CROP_output.cluster.fasta | grep ">" > heads.txt sys sed 's/=[0-9]*/=/g' heads.txt > heads2.txt sys cat CROP_output.cluster.fasta | grep -v ">" > seqs.fasta sys awk '{print $2}' CROP_output.cluster > abundancies.txt sys tail -n +2 abundancies.txt > abundancies2.txt sys paste heads2.txt abundancies2.txt > new_heads.txt sys sed 's/\t//g' new_heads.txt > final_heads.txt sys paste -d'\n' final_heads.txt seqs.fasta > crop_for_rdpclassifier.fasta sys rm heads.txt heads2.txt seqs.fasta abundancies.txt abundancies2.txt new_heads.txt final_heads.txt } print 'I have made the file I need for the taxonomy assignment' + "\n" ## and now RDPClassifier can be performed croPath.chdir() task java -Xmx64g -jar $path/tools/RDPTools/classifier.jar classify -t $path/tools/RDPTools/TRAIN/rRNAClassifier.properties -o taxonomy_from_rdpclassifer_COI_CROP.txt crop_for_rdpclassifier.fasta croPath.chdir() sys bash $path/scripts/keepOnlyPerCentSimilartityForCrop.sh } ######################################################################################################################## ############################## here starts the 'makeOtuTable' script ... 
########################################################################################################################
##############################   here starts the 'makeOtuTable' script ... take as much information as possible   #####
########################################################################################################################

string fileFromRdpClassifier
string textFinal
string perSampleTriples

# set as variables the files that will be used and created, depending on the clustering algorithm
if ( paramsOfTable{'gene'} == 'gene_COI' && paramsOfTable{'clusteringAlgo'} == 'algo_SWARM' ) {
    string swarmPath = genePath + '/' + 'gene_COI' + '/' + 'SWARM'
    fileFromRdpClassifier = 'tax_assign_swarm_COI.txt'
    textFinal = 'finalTriplesFromSwarmClustering.txt'
    perSampleTriples = 'finalTriplesPerSampleFromSwarmClustering.txt'
    swarmPath.chdir()
} else if ( paramsOfTable{'gene'} == 'gene_COI' && paramsOfTable{'clusteringAlgo'} == 'algo_CROP' ) {
    string croPath = genePath + '/' + 'gene_COI' + '/' + 'CROP'
    fileFromRdpClassifier = 'taxonomy_from_rdpclassifer_COI_CROP.txt'
    textFinal = 'finalTriplesFromCropClustering.txt'
    perSampleTriples = 'finalTriplesPerSampleFromCropClustering.txt'
    croPath.chdir()
}
wait

# we check whether the output of the classifier is non-empty
if ( paramsOfTable{'gene'} == 'gene_COI' && fileFromRdpClassifier.size() != 0 ) {

    print ' we have an RDPClassifier output file to work with!!! '

    bool debugCode = false
    string{} giveSampleAndTaxonomyGetSizeSum
    string{} sampleIdentifiers    # set of all samples encountered
    string{} taxaNames            # set of all assigned taxa

    ##### from the output file of either clustering algorithm (SWARM or CROP) followed by the RDPClassifier, we get an assignment file from which we extract all the information we need;
    ##### here we keep only the taxonomy assigned at a given threshold and above (e.g. >97%) and merge all OTUs with the same taxonomy in the same sample into one entry;
    ##### finally, we sum up the sizes of the OTUs we merged

    string[] lines = fileFromRdpClassifier.readLines()
    for ( string line : lines ) {

        # an input line looks like (columns are tab-separated):
        #   ERR1308250:22628;size=4349  ""    root  root  1      Eukaryota  superkingdom  1      Arthropoda  phylum ...
        #   0                           1     2     3     4      5          6             7      8           9      ...
        #   PARSE                       SKIP  SKIP  SKIP  PARSE  PARSE      SKIP          PARSE  PARSE       SKIP   ...

        print line + "\n"
        string[] lineElements = line.split("\t")
        string sample = lineElements[0].split(':')[0]          # parse the 1st column (index 0), get its first component
        string sizeAsString = lineElements[0].split('=')[1]    # parse the 1st column (index 0), get its last component
        int size = sizeAsString.parseInt()
        if ( debugCode ) { print "TEST1\t" + sample + "\t" + size + "\n" }

        string thereIsMoreToRead = 'true'

        # the current triplet refers to e.g. the "Eukaryota superkingdom 1" columns
        int currentTripletStartColumn = 1    # start parsing from the 2nd column, i.e. the one with index 1
        string assigned_taxo = ""
        string assignmentConfidence = 0

        while ( thereIsMoreToRead == 'true' ) {

            assigned_taxo = lineElements[currentTripletStartColumn]    # read column with index 5, 8, 11...
            if ( debugCode ) { print "t1:" + currentTripletStartColumn + ":" + assigned_taxo + "\n" }

            # columns with index 6, 9, 12,... (the rank names) are skipped;
            # read the confidence column with index 7, 10, 13...
            assignmentConfidence = lineElements[currentTripletStartColumn + 2]
            print "the assigned taxo is " + assigned_taxo + "\n"
            print "assignmentConfidence is " + assignmentConfidence + "\n"
            print "size is equal to: " + size + "\n"
            if ( debugCode ) { print "t2:" + currentTripletStartColumn + ":" + assignmentConfidence + "\n" }

            # if no next taxon triplet follows, return
            int nextTripletConfidenceColumn = currentTripletStartColumn + 5
            if ( debugCode ) { print "t3:" + currentTripletStartColumn + " index out of max " + lineElements.size() - 1 + "\n" }

            # inspect the column with index 8, 11, 14... for existence
            if ( nextTripletConfidenceColumn > lineElements.size() ) {
                # the whole table has been read and the species level reached
                if ( debugCode ) { print "t4: column with index " + nextTripletConfidenceColumn + " does not exist\n" }
                thereIsMoreToRead = 'false'
                continue;
            }

            # if the confidence of the next taxon is below the threshold, return
            if ( lineElements[nextTripletConfidenceColumn] < 0.97 ) {
                if ( debugCode ) { print "t5:" + nextTripletConfidenceColumn + "\n" }
                thereIsMoreToRead = 'false'
                continue;
            }

            currentTripletStartColumn = currentTripletStartColumn + 3    # move to the next taxon triplet

        }    # end of the while loop that reads taxa and confidence levels

        if ( debugCode ) { print "TEST2\t" + sample + "\t" + assigned_taxo + "\t" + assignmentConfidence + "\t" + size + "\n" }

        #######################   up to here, the code finds the deepest taxon level   ####################################

        sampleIdentifiers{ sample } = 1;
        taxaNames{ assigned_taxo } = 1;
        wait

        # add this size to any previous size sum corresponding to the same (sample, taxonomy) pair
        if ( giveSampleAndTaxonomyGetSizeSum.hasKey(sample + '_SEP_' + assigned_taxo) == 'true' ) {
            int newSum = giveSampleAndTaxonomyGetSizeSum{ sample + '_SEP_' + assigned_taxo }.parseInt()
            print("size is " + size)
            newSum += size
            print("new sum is " + newSum)
            giveSampleAndTaxonomyGetSizeSum{ sample + '_SEP_' + assigned_taxo } = newSum
        } else {
            giveSampleAndTaxonomyGetSizeSum{ sample + '_SEP_' + assigned_taxo } = size
        }

        if ( debugCode ) { print "TEST3\t" + sample + '_' + assigned_taxo + "-" + giveSampleAndTaxonomyGetSizeSum{ sample + '_' + assigned_taxo } + "\n" }

    }    # end of each input file line
    wait

    if ( debugCode ) { print giveSampleAndTaxonomyGetSizeSum }

    # print the output based on the giveSampleAndTaxonomyGetSizeSum map
    string sampleToPrint
    string sizeSumToPrint
    string taxonomyToPrint
    wait

    for ( string sampleAndTaxon : giveSampleAndTaxonomyGetSizeSum.keys() ) {
        sampleToPrint = sampleAndTaxon.split('_SEP_')[0]
        sizeSumToPrint = giveSampleAndTaxonomyGetSizeSum{ sampleAndTaxon }
        taxonomyToPrint = sampleAndTaxon.split('_SEP_')[1]
        sys echo '$sampleToPrint $sizeSumToPrint $taxonomyToPrint ' >> $textFinal
        ######## NOTE: the issue (open for now) lies in the creation of textFinal, i.e. of finalTriplesFromSwarmClustering.txt, and NOT of the PER-SAMPLE file !!
    }

    if ( debugCode ) {
        print taxaNames
        print sampleIdentifiers
    }

    ## a double for-loop over all possible (sample id, taxon name) combinations, printing the real ones,
    ## i.e. the ones that actually have a size value
    string sizeToPrintPerSample
    for ( string sampleId : sampleIdentifiers.keys() ) {
        for ( string currentTaxon : taxaNames.keys() ) {
            ## print only for the (sample, taxonomy) pairs that have a size
            if ( giveSampleAndTaxonomyGetSizeSum.hasKey( sampleId + "_SEP_" + currentTaxon ) ) {
                sizeToPrintPerSample = giveSampleAndTaxonomyGetSizeSum{ sampleId + "_SEP_" + currentTaxon }
                sys echo '$sampleId $sizeToPrintPerSample $currentTaxon' >> $sampleId.$perSampleTriples
            }
        }
    }
    print 'just finished with the triples!' + "\n"
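    # Illustrative shape of the two outputs written above (sample name, count and taxon are made up):
    #   finalTriples*.txt                    ->  ERR1308250 4349 Arthropoda
    #   <sample>.finalTriplesPerSample*.txt  ->  one file per sample, holding the same
    #                                            'sample  summed-size  deepest-taxon' triples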
    wait

    # finally, make an OTU table only with the significant assignments (>97%)
    if ( paramsOfTable{'clusteringAlgo'} == 'algo_SWARM' ) {
        string swarmPath = genePath + '/' + 'gene_COI' + '/' + 'SWARM'
        swarmPath.chdir()
        sys bash $path/scripts/createOtuTableOnlyForTaxonomyAssigned.sh
    }
    if ( paramsOfTable{'clusteringAlgo'} == 'algo_CROP' ) {
        string cropPath = genePath + '/' + 'gene_COI' + '/' + 'CROP'
        cropPath.chdir()
        sys bash $path/scripts/createOtuTableOnlyForTaxonomyAssigned.sh
    }
}
print ' At last! We now have the MOTU-table! :) ' + "\n"
wait

###################################   here ends the 'makeOtuTable' script !!!   ###############################################

#////////////////////////////////////////////////////////////////////////////////////////////////////////

##############################################################################################################################
####   now we have everything we need, so we create per-sample files in which we keep only the info considered valuable   ####
##############################################################################################################################

bayesPath.chdir()
string[] names = bayesPath.dir()
outputPerSamplePath.chdir()

# for each sample we make a folder with its name, in order to put the per-sample PEMA output there
for ( string sample : names ) {
    sample.mkdir()
}

fastqcPath.chdir()

# in 'samples' we keep a list with all the file names found in the FastQC output folder - that means for all the files
string[] samples = fastqcPath.dir()

# we remove all .html files from our list
for ( string file : samples ) {
    if ( file.endsWith(".html") ) {
        samples.remove(file)
    }
}

# set variables
string unzipfq
string sampleNoZip
fastqcPath.chdir()

for ( string sample : samples ) {

    # we unzip the FastQC output for each file
    task unzip $fastqcPath/$sample
    sampleNoZip = sample.split('.zip')[0]
    unzipfq = sample.split('_[0-9]_fastqc')[0]
    wait

    # we copy two significant FastQC output files to the sample's output folder
    task {
        sys cp $fastqcPath/$sampleNoZip/Images/per_base_quality.png $outputPerSamplePath/$unzipfq
        sys cp $fastqcPath/$sampleNoZip/summary.txt $outputPerSamplePath/$unzipfq
    }
    wait
    unzipfq = ''
}
wait
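# Illustrative example of the name handling above (file name is hypothetical):
#   ERR1308250_1_fastqc.zip  ->  sampleNoZip = ERR1308250_1_fastqc  (the unzipped FastQC folder)
#                            ->  unzipfq     = ERR1308250           (the per-sample output folder)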
########################################################################################################
##########   making the per-sample output for the 16S marker gene & alignment-based assignment   ######
#########################################################################################################

genePath.chdir()
if ( paramsOfTable{'gene'} == 'gene_16S' && paramsOfTable{'taxonomyAssignmentMethod'} == 'alignment' ) {

    string sixteenPath = genePath + '/' + "gene_16S"
    string assignmentPath = sixteenPath + '/' + paramsOfTable{'taxonomyFolderName'}
    assignmentPath.chdir()

    task {
        sys sed "s/ /''/g" Richness.tsv > Richness2.tsv
        sys sed "s/ /''/g" All_Cumulative.tsv > All_Cumulative2.tsv
        sys sed "s/ /''/g" Relative_Abundance.tsv > Relative_Abundance2.tsv
    }
    wait

    for ( string sample : names ) {
        wait
        sys awk '{print $2, $3, $sample}' Richness2.tsv > Richness_$sample.tsv
        sys awk '{print $2, $3, $sample}' All_Cumulative2.tsv > All_Cumulative_$sample.tsv
        sys awk '{print $2, $3, $sample}' Relative_Abundance2.tsv > Relative_Abundance_$sample.tsv
        sys awk '{print $sample}' All_Cumulative2.tsv > All_Cumulative_only_int_$sample.tsv
        sys awk '{print $sample}' Relative_Abundance2.tsv > Relative_Abundance_only_int_$sample.tsv
        wait
        sys paste -d ' ' Richness_$sample.tsv All_Cumulative_only_int_$sample.tsv Relative_Abundance_only_int_$sample.tsv > profile_$sample.tsv
        wait
        sys rm All_Cumulative_only_int_$sample.tsv Relative_Abundance_only_int_$sample.tsv
        wait
        sys mv profile_$sample.tsv Richness_$sample.tsv All_Cumulative_$sample.tsv Relative_Abundance_$sample.tsv $outputPerSamplePath/$sample
    }
    sys rm Richness2.tsv All_Cumulative2.tsv Relative_Abundance2.tsv
}

###########################################################################################
##################   making the per-sample output for the COI marker gene   ##############
###########################################################################################

if ( paramsOfTable{'gene'} == 'gene_COI' && paramsOfTable{'clusteringAlgo'} == 'algo_SWARM' ) {

    # TODO: create the OTU table using string.readLines()
    string swarmPath = genePath + '/' + 'gene_COI' + '/' + 'SWARM'
    print swarmPath + "\n"
    swarmPath.chdir()

    string[] allSamplesList
    string[] allContainsOfSwarm = swarmPath.dir()
    for ( string sample : allContainsOfSwarm ) {
        print "sample is " + sample + "\n"
        if ( sample.endsWith('.finalTriplesPerSampleFromSwarmClustering.txt') == true ) {
            string sampleId = sample.split('.finalTriplesPerSampleFromSwarmClustering.txt')[0]
            print "sample is " + sampleId + "\n"
            sys mv $sample $outputPerSamplePath/$sampleId
        }
    }

    # here, the number of SPECIES found across all samples is computed
    sys bash $path/scripts/uniqSpecies.sh
}

if ( paramsOfTable{'gene'} == 'gene_COI' && paramsOfTable{'clusteringAlgo'} == 'algo_CROP' ) {

    print 'this is where our last little gap is.....' + "\n"
    string cropPath = genePath + '/' + 'gene_COI' + '/' + 'CROP'
    print cropPath + "\n"
    cropPath.chdir()

    string[] allSamplesList
    string[] allContainsOfCrop = cropPath.dir()
    for ( string sample : allContainsOfCrop ) {
        print "sample is " + sample + "\n"
        if ( sample.endsWith('.finalTriplesPerSampleFromCropClustering.txt') == true ) {
            string sampleId = sample.split('.finalTriplesPerSampleFromCropClustering.txt')[0]
            print "sample is " + sampleId + "\n"
            sys mv $sample $outputPerSamplePath/$sampleId
        }
    }
    # here, the number of SPECIES found across all samples is computed
    # TODO: create the OTU table using string.readLines()
}

checkpoint "phyloseq.chp"
parameterFilePath.chdir()
string m = "parameters.tsv"
string{} paramForPhyloseq = readParameterFile(m)

######################################################################################
#############################     phyloseq time!!!     ##############################
######################################################################################

# To run a phyloseq analysis, a phylogenetic tree of the OTUs found needs to be built.
# For that purpose, PEMA creates a Multiple Sequence Alignment (MSA) using the MAFFT algorithm;
# then the tree is built using RAxML.
# Finally, the user has to provide PEMA with a "metadata.csv" file. This needs to be in the analysis folder, along with parameters.tsv.
# Pay attention!
# You also need to fill in the variable with the SILVA version in the "phyloseq_in_PEMA.R" file, in order to get the phyloseq analysis.

if ( paramForPhyloseq{'gene'} == 'gene_16S' && paramForPhyloseq{'phyloseq'} == 'Yes' ) {

    outputFilePath.chdir()
    string phyloseq_folder = "phyloseq_output"
    string assignmentPath = genePath + '/' + "gene_16S" + '/' + paramForPhyloseq{'taxonomyFolderName'}

    ## if you want to use phyloseq functions that need a phylogenetic tree of the OTUs found, such a tree has to be built!
    if ( paramForPhyloseq{'tree'} == 'Yes' ) {

        assignmentPath.chdir()

        ## first, we align the sequences of the final OTUs using the MAFFT aligner
        task /usr/bin/mafft --auto --reorder --thread -1 Aligned_assignments.fasta > aligned_otus.fasta
        wait
        print "the MAFFT alignment has been concluded" + "\n"

        ## to get a proper tree, we first run the RAxML-ng "check" algorithm, in order to validate our MSA
        task $path/tools/raxml-ng/raxml-ng/bin/raxml-ng --check --msa aligned_otus.fasta --model GTR+G4 || true
        wait
        print " The check of the alignment is done!" + "\n"

        ## and then we create our tree using RAxML-ng; if the check reduced the MSA, we use the reduced PHYLIP file
        string test = 'aligned_otus.fasta.raxml.reduced.phy'
        if ( test.exists() == 'true' ) {
            task $path/tools/raxml-ng/raxml-ng/bin/raxml-ng --all --msa-format PHYLIP --msa aligned_otus.fasta.raxml.reduced.phy --tree pars{$paramForPhyloseq{'parsTrees'}} --bs-trees $paramForPhyloseq{'bootstrapTrees'} --model GTR+G4 --threads $paramForPhyloseq{'raxmlThreads'} --force
            wait
            sys mv aligned_otus.fasta.raxml.reduced.phy.raxml.bestTree aligned_otus.fasta.raxml.bestTree
        } else {
            print "the MAFFT alignment was good enough for raxml-ng" + "\n"
            task $path/tools/raxml-ng/raxml-ng/bin/raxml-ng --all --msa-format FASTA --msa aligned_otus.fasta --tree pars{$paramForPhyloseq{'parsTrees'}} --bs-trees $paramForPhyloseq{'bootstrapTrees'} --model GTR+G4 --threads $paramForPhyloseq{'raxmlThreads'} --force
            wait
        }
        wait
    }

    # check whether a phyloseq-output folder has already been made by a previous run
    if ( phyloseq_folder.isDir() ) {
        print "a phyloseq-output folder has already been built\n"
        wait
    } else {
        # create a folder where the input/output of phyloseq will be located
        phyloseq_folder.mkdir()
    }

    ## finally, we copy all the necessary PEMA output files, which phyloseq uses as input, to the phyloseq-output folder
    assignmentPath.chdir()
    task {
        ## add the OTU-table found to the phyloseq folder and name it as the phyloseq script expects
        sys cp 16S_otutab_$paramForPhyloseq{'taxonomyFolderName'}.txt $outputFilePath/phyloseq_output
        sys cd $outputFilePath/phyloseq_output
        sys mv 16S_otutab_$paramForPhyloseq{'taxonomyFolderName'}.txt 16S_otu_table.txt
        ## add the metadata file to the phyloseq folder
        sys cp $path/metadata.csv $outputFilePath/phyloseq_output
    }

    string test2 = "aligned_otus.fasta.raxml.bestTree"
    if ( test2.exists() == 'true' ) {
        task {
            sys cp aligned_otus.fasta.raxml.bestTree $outputFilePath/phyloseq_output
            sys cd $outputFilePath/phyloseq_output
            sys mv aligned_otus.fasta.raxml.bestTree OTUs_tree.tre
        }
    }

    # run the phyloseq script
    string phyloseqPath = outputFilePath + '/' + "phyloseq_output"
    phyloseqPath.chdir()
    sys cp $path/phyloseq_in_PEMA.R .
    wait
    sys /mnt/big/R-3.5.2/bin/Rscript phyloseq_in_PEMA.R
    wait
}

#######################################   phyloseq has been concluded   #######################################################

### make a folder into which all the checkpoints will be moved
parameterFilePath.chdir() ;
if ( paramForPhyloseq{'emptyCheckpoints'} == 'Yes' ) {
    string checkpointsDir = 'checkpoints_for_' + paramForPhyloseq{'outputFolderName'}
    outputFilePath.chdir()
    checkpointsDir.mkdir()
    #parameterFilePath.chdir()
    path.chdir()
    sys mv *.chp $outputFilePath/$checkpointsDir
}

# keep only one OTU-table per analysis in the case of the 16S marker gene and the alignment-based taxonomy assignment method
if ( paramForPhyloseq{'gene'} == '16S' && paramForPhyloseq{'taxonomyAssignmentMethod'} == 'alignment' ) {
    string assignmentPath = genePath + '/' + "gene_16S" + '/' + paramForPhyloseq{'taxonomyFolderName'}
    assignmentPath.chdir()
    sys cd
    sys mv 16S_otu_table.txt 16S_otutab_$paramForPhyloseq{'taxonomyFolderName'}.txt
}

outputFilePath.chdir()
sys mv 4.merged_by_SPAdes 4.merged_by_PANDAseq

print(" PEMA has come to an end! Let biology start! ")