-
Notifications
You must be signed in to change notification settings - Fork 3
/
main.nf
executable file
·1249 lines (1124 loc) · 46.9 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env nextflow
/*
* Notes:
*/
//A possible future option is to directly download reads from SRA
if (params.sra_download != null)
{
Channel
.fromSRA(params.sra_download)
.set{readsTuple}
}
else
{
Channel.empty().set{readsTuple}
}
process download_sra_reads {
beforeScript params.before_script_cmds
cpus 1
input:
set val(sample_id), val(readFiles) from readsTuple
output:
set file("${sample_id}_1.fq.gz"), file("${sample_id}_1.fq.gz"), file("${sample_id}_2.fq.gz") into pairedDownloadedReads
script:
"""
wget -O ${sample_id}_1.fq.gz ftp://ftp.sra.ebi.ac.uk${readFiles[0]}
wget -O ${sample_id}_2.fq.gz ftp://ftp.sra.ebi.ac.uk${readFiles[1]}
"""
}
//The below code ensures that if the workflow is rerun on a different day from the same input files, then
//it doesn't recalculate the assembly_prefix
def outFile = new File('transXpress_results/assembly_prefix.txt')
if (!outFile.exists()) {
def resultsDir = new File('transXpress_results')
if (!resultsDir.exists()) {
resultsDir.mkdirs() //Make the transXpress_results directory
}
theDate = ""
if (params.prefix_add_date == true) {
theDate = new java.util.Date().format(params.prefix_add_date_formatting) //yyMMdd by default
}
metadata = ""
if (params.prefix_add_metadata_file != "") {
theText = file(params.prefix_add_metadata_file).text
metadata = theText.replace(" ", "_").trim()
}
dateMetadataToJoin = []
if (theDate != "") {
dateMetadataToJoin.add(theDate)
}
if (metadata != "") {
dateMetadataToJoin.add(metadata)
}
dateMetadataPrefix = dateMetadataToJoin.join("_")
if (dateMetadataPrefix != "") {
dateMetadataPrefix += "_"
}
outFile.withWriter('UTF-8') {
writer ->
writer.write(dateMetadataPrefix)
}
}
//Convert into a global string variable
dateMetadataPrefix = outFile.text
log.info """
transXpress
===================================
"""+dateMetadataPrefix+" assembling with "+params.assembler
/*
* Step 0 Download reference databases
*/
//process downloadEggNOG {
// executor 'local'
// cpus 1
// storeDir params.storeDB
// beforeScript params.before_script_cmds
// output:
// file "NOG.annotations.tsv" into eggNOGDb
// script:
// """
// wget "http://eggnogdb.embl.de/download/latest/data/NOG/NOG.annotations.tsv.gz"
// gunzip NOG.annotations.tsv.gz
// """
//}
process downloadRfam {
executor 'local'
cpus 1
storeDir params.storeDB
beforeScript params.before_script_cmds
output:
set file("Rfam.cm"), file("Rfam.cm.???") into rfamDb
script:
"""
wget "ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz"
gunzip Rfam.cm.gz
cmpress Rfam.cm
"""
}
process downloadSprot {
executor 'local'
cpus 1
storeDir params.storeDB
beforeScript params.before_script_cmds
output:
set file("uniprot_sprot.fasta"), file("uniprot_sprot.fasta.p??") into sprotDb
script:
"""
wget "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
gunzip uniprot_sprot.fasta.gz
makeblastdb -in uniprot_sprot.fasta -dbtype prot
"""
}
process downloadPfam {
executor 'local'
cpus 1
storeDir params.storeDB
beforeScript params.before_script_cmds
output:
set file("Pfam-A.hmm"), file("Pfam-A.hmm.h??") into pfamDb
script:
"""
wget "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz"
gunzip Pfam-A.hmm.gz
hmmpress Pfam-A.hmm
"""
}
/*
* Step 1. De novo assembly
*/
///
/// Load read files into channels
///
process loadSamples {
executor 'local'
beforeScript params.before_script_cmds
input:
file "samples.tsv" from file(params.samples)
output:
file "samples.tsv" into toParse, toRelative
script:
"""
##do nothing. This process is just so the dag looks nicer
"""
}
//Load samples into filepair tuples.
toParse.splitCsv(sep:'\t',header:false)
.map{ row ->
println row
return tuple(file(row[2]), file(row[3])) }
.set{ readPairs_ch }
readPairs_ch.into{ trimReadPairs_ch ; fastqcReadPairs_ch }
//Once the sample files are loaded into Nextflow channels, everything should be specified relatively
process convertSamplesToRelative {
executor 'local'
beforeScript params.before_script_cmds
input:
file toRelative
output:
file "relative_samples.tsv" into relativeSamples_ch
script:
"""
#!/usr/bin/env python
import re
import os
import os.path
output_handle = open("relative_samples.tsv", "w")
with open("${toRelative}", "r") as input_handle:
for line in input_handle:
row = re.split("[\t ]+", line)
newRow = [row[0],row[1]]
forwardReads = os.path.basename(row[2]).strip()+".R1-P.qtrim.fastq.gz"
reverseReads = os.path.basename(row[3]).strip()+".R2-P.qtrim.fastq.gz"
newRow.append(forwardReads)
newRow.append(reverseReads)
newRowString = "\t".join(newRow)+os.linesep
output_handle.write(newRowString)
output_handle.close()
"""
}
relativeSamples_ch.into{ samples_file_toTrinity; relativeSamples_toTrinityFinish; relativeSamples_toKallisto; samples_file_toYAMLConvert}
process prepareTrimmomaticAdapters {
output:
path "adapters.fa" into trimmomaticAdapters
shell:
'''
seqkit seq -w 0 ${CONDA_PREFIX}/share/trimmomatic-0.39-1/adapters/*.fa > adapters.fa
'''
}
process trimmomatic {
cpus params.general_CPUs
queue params.queue_standard_nodes
time params.queue_stdtime
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
input:
set file(R1_reads),file(R2_reads) from trimReadPairs_ch
file "adapters.fasta" from trimmomaticAdapters
tag {"$R1_reads"+" and " +"$R2_reads"}
output:
set file("${R1_reads}.R1-P.qtrim.fastq.gz"), file("${R2_reads}.R2-P.qtrim.fastq.gz") into filteredPairedReads_toChoice,filteredPairedReads_toKallisto
//file "*U.qtrim.fastq.gz" into filteredSingleReads
script:
"""
trimmomatic PE -threads ${task.cpus} ${R1_reads} ${R2_reads} ${R1_reads}.R1-P.qtrim.fastq.gz ${R1_reads}.R1-U.qtrim.fastq.gz ${R2_reads}.R2-P.qtrim.fastq.gz ${R2_reads}.R2-U.qtrim.fastq.gz ${params.TRIMMOMATIC_PARAMS}
"""
}
///This switch controls which assembler is executed.
filteredPairedReads_toTrinity = Channel.create()
filteredPairedReads_toRnaspades = Channel.create()
filteredPairedReads_toChoice.choice(filteredPairedReads_toTrinity,filteredPairedReads_toRnaspades) { assemblerString = params.assembler.toLowerCase().trim()
if (assemblerString == "trinity") return 0
else if (assemblerString == "rnaspades") return 1
else {
println "Error:"+params.assembler+" doesn't match an implemented assembler"
return -1 //Error status
}
}
filteredPairedReads_toTrinity.collect().into{ trinityInchwormPairedReads ; trinityFinishPairedReads }
process fastqc {
publishDir "transXpress_results/fastqc_results/", mode: "copy"
cpus params.general_CPUs
queue params.queue_standard_nodes
time params.queue_stdtime
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
input:
set file(R1_reads),file(R2_reads) from fastqcReadPairs_ch
tag {"$R1_reads"+" and " +"$R2_reads"}
output:
set file("${R1_reads}.fastqc.ok"), file("${R2_reads}.fastqc.ok") into fastqcPassResults
file "*.html" into fastqcHtmlResults
script:
"""
fastqc ${R1_reads} &
fastqc ${R2_reads}
##TODO: fix this, so the pipeline doesn't execute if bad data is detected.
##Check for bad run
#cat *.html | grep -oe "\\[FAIL\\].{1,30}Per sequence quality scores" > fastqc.fail
##If no bad run, produce the .fastqc.ok files
#if [[ ! -s fastqc.fail ]]
#then
# touch ${R1_reads}.fastqc.ok
# touch ${R2_reads}.fastqc.ok
#else
# echo "Error. FastQC detected a failure in the per read sequencing quality. This possibly indicates a bad sequencing run, which would result in a poor transcriptome assembly. Please solve the quality issue in your raw read files provided in samples.tsv"
# exit 1
#fi
##Bash conditional wasn't working. Just doing a dummy:
touch ${R1_reads}.fastqc.ok
touch ${R2_reads}.fastqc.ok
sleep 15 ##Not a super important process, so might as well put in a delay to help with filesystem latency issues.
"""
}
//This produces the samples file for rnaspades
process relativeSamplesToYAML {
executor 'local'
cpus 1
beforeScript params.before_script_cmds
input:
file samples_file_toYAMLConvert
//file filteredPairedReads from filteredPairedReads_toYAML.collect() //collect flattens the tuple. This input ensures the process waits until trimmomatic is all done, and also allows for assertions as a sanity check
output:
file "samples_trimmed.yaml" into yaml_rnaSPAdes_ch
script:
"""
#!/usr/bin/env python
import re
import os
import os.path
import pprint
sample_list = []
with open("${samples_file_toYAMLConvert}", "r") as input_handle:
for line in input_handle:
row = re.split("[\t ]+", line)
if (len(row) > 3): # paired reads
paired_dict = {}
paired_dict['orientation'] = 'fr'
paired_dict['type'] = 'paired-end'
f_reads = row[2].strip()
r_reads = row[3].strip()
#assert os.path.isfile(f_reads)
#assert os.path.isfile(r_reads)
paired_dict['left reads'] = [f_reads]
paired_dict['right reads'] = [r_reads]
##Depends on where the unpaired reads are written to
##Presumably could get from sample txt as well
##Note: Commented out, to make it consistent with Trinity. May change in future
##unpaired_reads_f = f[:-16]+"U.qtrim.fastq.gz"
##unpaired_reads_r = r[:-16]+"U.qtrim.fastq.gz"
##assert os.path.isfile(unpaired_reads_f)
##assert os.path.isfile(unpaired_reads_r)
##unpaired_reads = [unpaired_reads_f] + [unpaired_reads_r]
##if len(unpaired_reads) > 0:
## sample_dict['single reads'] = unpaired_reads
sample_list.append(paired_dict)
if (len(row) == 3): # unpaired reads
unpaired_dict = {}
unpaired_dict['type'] = 'single'
u_reads = row[2].strip()
assert os.path.isfile(u_reads)
unpaired_dict['single reads'] = [u_reads]
sample_list.append(unpaired_dict)
with open("samples_trimmed.yaml", "w") as output_handle:
output_handle.write(pprint.pformat(sample_list))
"""
}
process trinityInchwormChrysalis {
//See here for an alternative approach:
//https://github.com/biocorecrg/transcriptome_assembly/blob/564f6af2e4db9625ae9de6884a6524b4ec57cece/denovo_assembly/denovo_assembly.nf#L157
cache 'lenient'
cpus params.assembly_CPUs
memory params.assembly_MEM+" GB"
queue params.queue_highmemory_nodes
time params.queue_longtime
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
tag { dateMetadataPrefix+"Trinity" }
afterScript 'echo \"(Above completion message is from Trinity. transXpress will continue the pipeline execution.)\"'
//afterScript 'exit(1)'
input:
file trinityInchwormPairedReads //This flattens the tuple
file samples_file from samples_file_toTrinity
output:
file "trinity_out_dir/[!Tcr]*" into trinityWorkDirRootFiles_ch1, trinityWorkDirRootFiles_ch2 //Not files starting with c or r, so not chrysalis, read_partitions, recursive trinity cmds, Trinity.timing
file "trinity_out_dir/chrysalis/*" into trinityWorkDirChrysalisFiles_ch1, trinityWorkDirChrysalisFiles_ch2
file "trinity_out_dir/read_partitions/Fb*/C*/*.trinity.reads.fa" into trinityPhase1ReadPartitionsFiles_ch
//
script:
"""
Trinity --no_distributed_trinity_exec --max_memory ${task.memory.toGiga()}G --CPU ${task.cpus} --samples_file ${samples_file} ${params.TRINITY_PARAMS}
sleep 5 ##Try to prevent filesystem latency errors
#chmod -R a-w ./trinity_out_dir ##<- make the results read only, to troubleshoot Trinity processes writing into directories they shouldn't.
"""
}
//This is a workaround, of sorts, to parallelize the Trinity butterfly steps
//As Trinity usually expects to be writing into a single directory over time
//which doesn't play nice with Nextflow, have to do some tricks
//to recreate the structure of the Trinity work directory
//so each process ends up with a symbolic linked copy of the Trinity work directory
//and can write real files in the proper directory structure.
//But that directory structure can't be remade by Nextflow using Queue Channels, that I am aware of
//So the below mapping trick makes the directory structure into variables
//that the "trinityFinish" process can use to recreate the structure
trinityPhase1ReadPartitionsFiles_ch.flatten().map{ file ->
def filePath = file.toAbsolutePath().toString()
dir1Match = filePath =~ /Fb_[0-9]+/ ///Regex matching
dir2Match = filePath =~ /CBin_[0-9]+/ ///Regex matching
dir1String = dir1Match[0]
dir2String = dir2Match[0]
return tuple( [dir1String,dir2String] ,file)
}
.groupTuple()
.set{ partitionedReadGroups_ch }
//Add .groupTuple() to execute in groups by the directories
process trinityButterflyParallel {
//TODO This process has hardcoded parameters. It should really be getting them from the TRINITY params...
//See here for an alternative approach to this node:
//https://github.com/biocorecrg/transcriptome_assembly/blob/564f6af2e4db9625ae9de6884a6524b4ec57cece/denovo_assembly/denovo_assembly.nf#L183
cache 'lenient'
maxForks params.max_forks
cpus params.general_CPUs
queue params.queue_standard_nodes
time params.queue_stdtime
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
input:
file "trinity_out_dir/*" from trinityWorkDirRootFiles_ch1
file "trinity_out_dir/chrysalis/*" from trinityWorkDirChrysalisFiles_ch1
set dir,file(fastaFiles) from partitionedReadGroups_ch
output:
file "commands.completed" into trinityFinishedCmds
file "commands.txt" into trinityCmds
file "trinity_out_dir/read_partitions/*/*/*out.Trinity.fasta" into butterflyTrinityFiles
tag { dateMetadataPrefix+"Trinity-"+dir[0]+"/"+dir[1]}
script:
"""
##Have to recreate the directory structure for the read_partition files
mkdir "trinity_out_dir/read_partitions"
mkdir "trinity_out_dir/read_partitions/${dir[0]}"
mkdir "trinity_out_dir/read_partitions/${dir[0]}/${dir[1]}"
for f in ./*.fa
do
##Once the directory structure is made, link the FASTA file back into it.
##Note the dollar sign escaping for nextflow
ln -s "../../../../\$f" "trinity_out_dir/read_partitions/${dir[0]}/${dir[1]}/\$f"
done
for f in ./trinity_out_dir/read_partitions/${dir[0]}/${dir[1]}/*.fa
do
##Make our own Trinity commands, using relative paths
echo "Trinity --single '\$f' --output '\$f.out' --CPU 1 --max_memory 1G --run_as_paired --seqType fa --trinity_complete --full_cleanup --no_distributed_trinity_exec --min_glue 2 --min_kmer_cov 2" >> commands.txt
done
##Execute in parallel
parallel --jobs ${task.cpus} < commands.txt
cp commands.txt commands.completed
#chmod -R a-w ./trinity_out_dir
sleep 15 ##Try to prevent filesystem latency errors
"""
}
process trinityFinish {
cache "lenient"
executor "local"
publishDir "transXpress_results", mode: "copy", saveAs: { filename -> filename.replaceAll("trinity_out_dir/Trinity", "transcriptome") }
beforeScript params.before_script_cmds
input:
file "trinity_out_dir/*" from trinityWorkDirRootFiles_ch2 //An attempt to relativize the butterfly processes
file "trinity_out_dir/chrysalis/*" from trinityWorkDirChrysalisFiles_ch2 //An attempt to relativize the butterfly processes
file butterflyTrinityFilesCollected from butterflyTrinityFiles.collect()
file relativeSamples from relativeSamples_toTrinityFinish
file finishedCommands from trinityFinishedCmds.collectFile(name: "recursive_trinity.cmds.completed")
file trinityCmdsCollected from trinityCmds.collectFile(name: "recursive_trinity.cmds")
file trinityFinishPairedReads
output:
set val("Trinity"), file("./trinity_out_dir/Trinity.fasta.gene_trans_map"),file("./trinity_out_dir/Trinity.fasta") into trinityFinalOutput
file "./trinity_out_dir/recursive_trinity.cmds.completed"
memory "1 GB"
cpus 1
tag { dateMetadataPrefix+"Trinity" }
script:
"""
mkdir trinity_out_dir/read_partitions/
mkdir trinity_out_dir/read_partitions/Fb_0/
mkdir trinity_out_dir/read_partitions/Fb_0/CBin_0 ##This is just a dummy directory to fool Trinity
sleep 15 ##Filesystem latency errors?
ls -1L | grep ".fasta" > fasta_files.txt
while read f; do
##Link the files into a directory structure that Trinity can deal with correctly, even if it isn't 100% right
ln -s "../../../../\$f" "trinity_out_dir/read_partitions/Fb_0/CBin_0/."
done < fasta_files.txt
##Have to produce these files to trick Trinity into thinking things are done
cp ${trinityCmdsCollected} trinity_out_dir/recursive_trinity.cmds
cp ${finishedCommands} trinity_out_dir/recursive_trinity.cmds.completed
touch trinity_out_dir/recursive_trinity.cmds.ok
Trinity --samples_file ${relativeSamples} --max_memory ${task.memory.toGiga()}G ${params.TRINITY_PARAMS}
"""
}
process runSPAdes {
cpus params.assembly_CPUs
memory params.assembly_MEM+" GB"
queue params.queue_highmemory_nodes
time params.queue_longtime
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
input:
file filteredPairedReads from filteredPairedReads_toRnaspades.collect()
file datasets_YAML from yaml_rnaSPAdes_ch
//file filteredSingleReads from filteredSingleReads_ch2.collect()
output:
set val("rnaSPAdes"), file("rnaSPAdes.gene_trans_map"),file(dateMetadataPrefix+"rnaSPAdes/transcripts.fasta") into rnaSPAdesFinalOutput
script:
"""
##TODO Fixed kmer? Should open it up as a parameter instead?
rnaspades.py --dataset ${datasets_YAML} ${params.STRAND_SPECIFIC_RNASPADES} -t ${task.cpus} -m ${task.memory.toGiga()} -o ${dateMetadataPrefix}rnaSPAdes --only-assembler -k 47
##Make a fake gene to transcript file:
cat "${dateMetadataPrefix}rnaSPAdes/transcripts.fasta" | grep ">" | tr -d ">" | cut -f 1 -d " " > tmp.txt
paste tmp.txt tmp.txt > rnaSPAdes.gene_trans_map
"""
}
trinityFinalOutput.mix(rnaSPAdesFinalOutput).set{ finishedAssemblies }
process renameAssembly {
executor 'local'
publishDir "transXpress_results", mode: "copy"
beforeScript params.before_script_cmds
input:
set val(assembler), file(geneTransMap), file(transcriptome_fasta) from finishedAssemblies
file "prefix.txt" from file(params.prefix_add_metadata_file) //Just a dummy input to include the file on the DAG
output:
file "${dateMetadataPrefix}${assembler}.transcripts.fasta" into transcriptomeToSplit
set assembler, file("${dateMetadataPrefix}${assembler}.transcripts.fasta") into transcriptomeToTransdecoder, transcriptomeToStats, transcriptomeToAnnotation //Also pass the assembler type along
set assembler, file("${dateMetadataPrefix}${assembler}.transcripts.fasta"), file("${assembler}_renamed.fasta.gene_trans_map") into transcriptomeGeneTransMapKallisto
tag { dateMetadataPrefix+"${assembler}" }
script:
"""
seqkit replace -p 'TRINITY_' -r '' ${transcriptome_fasta} | seqkit replace -p '^' -r '${dateMetadataPrefix}${assembler}_' > ${dateMetadataPrefix}${assembler}.transcripts.fasta
cat ${geneTransMap} | sed 's/TRINITY_//g' | sed 's/^/${dateMetadataPrefix}${assembler}_/g' | sed 's/\t/\t${dateMetadataPrefix}${assembler}_/g' > ${assembler}_renamed.fasta.gene_trans_map
"""
}
/*
* Step 3. Annotate the assembly
*/
process transdecoderLongOrfs {
publishDir "transXpress_results", mode: "copy"
queue params.queue_standard_nodes
time params.queue_stdtime
clusterOptions params.cluster_options
cpus 1
memory "5 GB"
beforeScript params.before_script_cmds
input:
set val(assemblerName),file(transcriptomeTransdecoder) from transcriptomeToTransdecoder
output:
file "${transcriptomeTransdecoder}.transdecoder_dir/*.pep" into longOrfsProteomeSplit
file "${transcriptomeTransdecoder}.transdecoder_dir/*" into transdecoderLongOrfsDirFiles
file "${transcriptomeTransdecoder}.transdecoder_dir.__checkpoints_longorfs/*" into longOrfsCheckpointsFiles
file "*.cmds" into longOrfsRootCmds
set val("${assemblerName}"), file(transcriptomeTransdecoder) into transcriptomeTransdecoderPredict
tag { dateMetadataPrefix+"${assemblerName}" }
script:
"""
TransDecoder.LongOrfs -t ${transcriptomeTransdecoder} -m 30 ##Minimum protein length = -m amino acids
#chmod -R a-w ${transcriptomeTransdecoder}.transdecoder_dir/ ##write protect the output to troubleshoot downstream accessing.
"""
}
process kallisto {
publishDir "transXpress_results", mode: "copy"
cpus params.general_CPUs
queue params.queue_standard_nodes
time params.queue_stdtime
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
input:
file filteredReadsFromPairs from filteredPairedReads_toKallisto.collect() //This flattens the tuples
set val(assemblerKallisto), file(transcriptomeKallisto), file(geneTransMap) from transcriptomeGeneTransMapKallisto
file relativeSamples from relativeSamples_toKallisto
output:
file "kallisto.isoform.TPM.not_cross_norm" into rawKallistoTable
file "kallisto.isoform.TMM.EXPR.matrix" optional true into normalizedKallistoTable
file "kallisto.gene.TPM.not_cross_norm"
file "kallisto.gene.TMM.EXPR.matrix" optional true
tag { dateMetadataPrefix+"${assemblerKallisto}" }
script:
"""
export TRINITY_HOME=\$(dirname \$(readlink -f \$(which Trinity)))
echo TRINITY_HOME set to \${TRINITY_HOME}
\${TRINITY_HOME}/util/align_and_estimate_abundance.pl --transcripts ${transcriptomeKallisto} ${params.STRAND_SPECIFIC_TRINITY} --seqType fq --samples_file ${relativeSamples} --prep_reference --thread_count ${task.cpus} --est_method kallisto --gene_trans_map ${geneTransMap}
\${TRINITY_HOME}/util/abundance_estimates_to_matrix.pl --est_method kallisto --name_sample_by_basedir --gene_trans_map $geneTransMap */abundance.tsv
"""
}
// Produce a new channel that will emit the expression data file, preferrably TMM normalized.
// If normalized expression file is not available (e.g., a single sample was used), the channel
// will emit the raw TPM values.
normalizedKallistoTable
.concat( rawKallistoTable )
.first()
.into { transcriptExpression; expressionStats }
//Not working, PDF naming issue. Easier to comment out than fix.
//process trinityStats {
// executor 'local'
// publishDir "transXpress_results", mode: "copy"
// cpus 1
// beforeScript params.before_script_cmds
// input:
// set val(assemblerStats), file(transcriptomeStats) from transcriptomeToStats
// file expressionStats
// output:
// file "transcriptome_stats.txt"
// file "transcriptome_exN50.plot.pdf"
// tag { dateMetadataPrefix+"${assemblerStats}" }
// script:
// """
// export TRINITY_HOME=\$(dirname \$(readlink -f \$(which Trinity)))
// echo TRINITY_HOME set to \${TRINITY_HOME}
// \${TRINITY_HOME}/util/TrinityStats.pl ${transcriptomeStats} > transcriptome_stats.txt
// \${TRINITY_HOME}/util/misc/contig_ExN50_statistic.pl ${expressionStats} ${transcriptomeStats} > transcriptome_exN50
// \${TRINITY_HOME}/util/misc/plot_ExN50_statistic.Rscript transcriptome_exN50
// """
//}
transcriptomeToSplit
.splitFasta(by: 100, file: true)
.into { sprotBlastxChunks; rfamChunks }
longOrfsProteomeSplit
.splitFasta(by: 100, file: true)
.into { sprotBlastpChunks; transdecoderPfamChunks }
process sprotBlastxParallel {
maxForks params.max_forks
cpus params.general_CPUs
queue { task.attempt > 1 ? params.queue_standard_nodes : params.queue_shorttime_nodes }
time { task.attempt > 1 ? params.queue_stdtime : params.queue_shorttime }
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
input:
file chunk from sprotBlastxChunks
set sprotDb, sprotDbIndex from sprotDb
output:
file "blastx_out" into sprotBlastxResults
tag { chunk }
script:
"""
echo blastx ${chunk} using database ${sprotDb}
blastx -query ${chunk} -db ${sprotDb} -num_threads ${task.cpus} -evalue 1e-6 -max_hsps 1 -max_target_seqs 1 -outfmt "6 std stitle" -out blastx_out
"""
}
process sprotBlastpParallel {
maxForks params.max_forks
cpus params.general_CPUs
queue { task.attempt > 1 ? params.queue_standard_nodes : params.queue_shorttime_nodes }
time { task.attempt > 1 ? params.queue_stdtime : params.queue_shorttime }
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
input:
file chunk from sprotBlastpChunks
set sprotDb, sprotDbIndex from sprotDb
output:
file "blastp_out" into sprotBlastpResults
tag { dateMetadataPrefix+"-"+chunk }
script:
"""
echo blastp ${chunk} using database ${sprotDb}
blastp -query ${chunk} -db ${sprotDb} -num_threads ${task.cpus} -evalue 1e-6 -max_hsps 1 -max_target_seqs 1 -outfmt "6 std stitle" -out blastp_out
"""
}
sprotBlastxResults.collectFile(name: 'blastx_annotations.tsv').set { blastxResult }
sprotBlastpResults.collectFile(name: 'blastp_annotations.tsv').into { blastpForTransdecoder; blastpResult }
process pfamParallel {
maxForks params.max_forks
cpus params.general_CPUs
queue { task.attempt > 1 ? params.queue_standard_nodes : params.queue_shorttime_nodes }
time { task.attempt > 1 ? params.queue_stdtime : params.queue_shorttime }
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
input:
file chunk from transdecoderPfamChunks
set pfamDb, pfamDbIndex from pfamDb
output:
file "pfam_out" into pfamResults
file "pfam_dom_out" into pfamDomResults
set file("${chunk}"),file("${"pfam_dom_out"}") into revisePfamChunks //Pass through of the "longorf" peptide files
tag { dateMetadataPrefix+chunk }
script:
"""
echo pfam ${chunk} using database ${pfamDb}
hmmscan --cpu ${task.cpus} --domtblout pfam_dom_out --tblout pfam_out ${pfamDb} ${chunk}
"""
}
pfamResults.collectFile(name: 'pfam_annotations.txt').set { pfamResult }
pfamDomResults.collectFile(name: 'pfam_dom_annotations.txt').into { pfamDomResult ; pfamForTransdecoder ; pfamToGff3Doms}
process rfamParallel {
maxForks params.max_forks
cpus params.general_CPUs
queue { task.attempt > 1 ? params.queue_standard_nodes : params.queue_shorttime_nodes }
time { task.attempt > 1 ? params.queue_stdtime : params.queue_shorttime }
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
input:
file chunk from rfamChunks
set rfamDb, rfamDbIndex from rfamDb
output:
file "rfam_out" into rfamResults
//file "rfam_dom_out" into rfamDomResults
tag { chunk }
script:
"""
echo rfam ${chunk} using database ${rfamDb}
cmscan -E 0.00001 --incE 0.00001 --rfam --cpu ${task.cpus} --tblout rfam_out ${rfamDb} ${chunk}
"""
}
process publishRfamResults {
executor 'local'
publishDir "transXpress_results", mode: "copy"
beforeScript params.before_script_cmds
cpus 1
input:
file rfamResult from rfamResults.collectFile(name: 'rfam_annotations_unsorted.txt')
output:
file "rfam_annotations.txt" into rfamResultPub
tag { dateMetadataPrefix }
script:
"""
cat ${rfamResult} | head -n 2 > header.txt
cat ${rfamResult} | tail -n 9 > footer.txt
cat header.txt ${rfamResult} footer.txt | grep -v "#" | sort -k3,3 -k15nr,15 | sort -u -k3,3 --merge | sort -k15nr,15 > rfam_annotations.txt
"""
}
process transdecoderPredict {
publishDir "transXpress_results", mode: "copy" // , saveAs: { filename -> "transcriptome_after_predict.pep" }
cpus 1
queue params.queue_standard_nodes
time { task.attempt > 1 ? params.queue_stdtime : params.queue_shorttime }
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
input:
set val(assembler),file(transcriptomeFile) from transcriptomeTransdecoderPredict
file "${transcriptomeFile}.transdecoder_dir/*" from transdecoderLongOrfsDirFiles
file "${transcriptomeFile}.transdecoder_dir.__checkpoints_longorfs/*" from longOrfsCheckpointsFiles
file longOrfsRootCmds
file blastpForTransdecoder
file pfamForTransdecoder
output:
file "${transcriptomeFile}.transdecoder.pep" into predictProteome, predictProteomeSplitBy100
file "${transcriptomeFile}.transdecoder.bed"
file "${transcriptomeFile}.transdecoder.cds"
file "${transcriptomeFile}.transdecoder.gff3" into transdecoderGFF3ToPfam, transdecoderGFF3ToLiftover
tag { dateMetadataPrefix+"${assembler}" }
script:
"""
TransDecoder.Predict -t ${transcriptomeFile} --retain_pfam_hits ${pfamForTransdecoder} --retain_blastp_hits ${blastpForTransdecoder}
"""
}
predictProteome.into{ proteomeToAnnotation ; proteomeToPfamRevise }
predictProteomeSplitBy100
.splitFasta(by: 100, file: true)
.into{ tmhmmChunks ; deeplocChunks ; signalpChunks }
signalpChunks.into{ signalp4Chunks ; signalp5Chunks}
proteomeToPfamRevise.combine(revisePfamChunks).set{ combinedToPfamRevise }
process revisePfamResults {
executor 'local'
cpus 1
beforeScript params.before_script_cmds
input:
set file(proteome),file(longOrfChunk),file(hmmerscanDomFile) from combinedToPfamRevise
output:
file "test.txt"
tag { dateMetadataPrefix+"${longOrfChunk}" }
script:
"""
seqkit fx2tab -n -i ${longOrfChunk} | cut -f 1 > ids.txt
seqkit grep -f ids.txt ${proteome} > post-predict_subset.fasta
seqkit grep -s -f <(seqkit seq -s ${longOrfChunk}) post-predict_subset.fasta > common.fasta ##Find all the fasta records where the sequences match exactly.
seqkit grep -v -s -f <(seqkit seq -s ${longOrfChunk}) post-predict_subset.fasta > differing.fasta ##Find all the fasta records where the sequences differ.
predictLen=`cat post-predict_subset.fasta | grep ">" | wc -l`
commonLen=`cat common.fasta | grep ">" | wc -l`
echo "\$predictLen proteins were analyzed & \$commonLen had an exact sequence match"
if [ \$predictLen -eq \$commonLen ]
then
echo "no changes between the longorfs and transdecoder predict versions"
touch test.txt
else
##This does get triggered with the test datasets, at least with the short peptides.
echo "changes detected"
touch test.txt
fi
"""
}
process pfamToGff3 {
publishDir "transXpress_results", mode: "copy"
executor 'local'
cpus 1
beforeScript params.before_script_cmds
input:
file pfamDomResult from pfamToGff3Doms
file refGFF3 from transdecoderGFF3ToPfam
output:
file "pfam_domains.gff3" //TODO hook this up to
tag { "$refGFF3" }
script:
"""
##Using wrf's pfam2gff script.
##Disabled for now until the dependency situation is figured out.
##python ../../../pfam2gff.py -g ${refGFF3} -i ${pfamDomResult} -T > pfam_domains.gff3
touch pfam_domains.gff3
"""
}
process signalp4Parallel {
cpus 1
maxForks params.max_forks
queue { task.attempt > 1 ? params.queue_standard_nodes : params.queue_shorttime_nodes }
time { task.attempt > 1 ? params.queue_stdtime : params.queue_shorttime }
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
input:
file chunk from signalp4Chunks
output:
file "signalp_out" into signalp4Results
tag { chunk }
script:
"""
if [ -f "/usr/local/bin/signalp" ]; then
signalp -t ${params.SIGNALP_ORGANISMS} -f short ${chunk} > signalp_out
else
echo "Unable to find signalP 4, so making dummy file instead"
touch signalp_out
fi
"""
}
process signalp5Parallel {
cpus 1
maxForks params.max_forks
queue { task.attempt > 1 ? params.queue_standard_nodes : params.queue_shorttime_nodes }
time { task.attempt > 1 ? params.queue_stdtime : params.queue_shorttime }
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
input:
file chunk from signalp5Chunks
output:
file "*signalp5" into signalp5Results
file "*.gff3" into signalp5ResultsGff3
tag { chunk }
script:
"""
##Weng lab specific stuff for testing
if [ -f "/lab/solexa_weng/testtube/signalp-5.0/bin/signalp" ]; then
/lab/solexa_weng/testtube/signalp-5.0/bin/signalp -prefix "signalp5_"${chunk} -org ${params.SIGNALP_ORGANISMS} -format short -fasta ${chunk} -gff3
else
echo "Unable to find signalP 5, so making dummy files instead"
touch blank.gff3
touch blank.signalp5
fi
"""
}
signalp5ResultsGff3.collectFile(name: 'signalp5_annotations.gff3').into{ signalp5ResultGff3ToAnnotate ; signalp5ResultGff3ToLiftover}
process deeplocParallel {
maxForks params.max_forks
queue { task.attempt > 1 ? params.queue_standard_nodes : params.queue_shorttime_nodes }
time { task.attempt > 1 ? params.queue_stdtime : params.queue_shorttime }
cpus 1
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
input:
file chunk from deeplocChunks
output:
file "${chunk}.out.txt" into deeplocResults
tag { chunk }
script:
"""
if hash deeploc 2>/dev/null;
then
export MKL_THREADING_LAYER=GNU
export PATH="/lab/solexa_weng/testtube/miniconda3/bin:$PATH"
deeploc -f ${chunk} -o ${chunk}.out
else
echo "Unable to find deeploc, so making dummy files instead"
touch ${chunk}.out.txt
fi
"""
}
tmhmmChunks.into{tmhmmChunks_ch1; tmhmmChunks_ch2}
//TODO: Update this to tmhmm.py, which is on conda?
process tmhmmParallel {
cpus 1
maxForks params.max_forks
queue { task.attempt > 1 ? params.queue_standard_nodes : params.queue_shorttime_nodes }
time { task.attempt > 1 ? params.queue_stdtime : params.queue_shorttime }
clusterOptions params.cluster_options
beforeScript params.before_script_cmds
input:
file chunk from tmhmmChunks_ch1
output:
file "tmhmm_out" into tmhmmResults
tag { chunk }
script:
"""
##If interested in using tmhmm.py:
##tmhmm -m \${CONDA_PREFIX}/TMHMM2.0.model -f ${chunk}
##It makes 3 files, *.annotation, *.plot, *.summary
if hash tmhmm 2>/dev/null;
then
echo tmhmm ${chunk}
##Have to be careful as dtu.dk tmhmm and tmhmm.py have a name collision.
tmhmm --short < ${chunk} > tmhmm_out
else
echo "Unable to find tmhmm, so making dummy files instead"
touch tmhmm_out
fi
"""
}
tmhmmResults.collectFile(name: 'tmhmm_annotations.tsv').into{ tmhmmResultToAnnotate ; tmhmmResultToGff3 }
//Comment out, rather than deal with this.
//process tmhmmPyParallel {
// //Recommend installing with pip, rather with Conda.
// publishDir "transXpress_results/tmhmm.py"
// maxForks params.max_forks
// cpus 1
// queue { task.attempt > 1 ? params.queue_standard_nodes : params.queue_shorttime_nodes }
// time { task.attempt > 1 ? params.queue_stdtime : params.queue_shorttime }
// clusterOptions params.cluster_options
// beforeScript params.before_script_cmds
// input:
// file chunk from tmhmmChunks_ch2
// output:
// file "./*.out" into tmhmmPyOutResults
// file "./*.plot" into tmhmmPyAnnotationResults
// file "./*.summary" into tmhmmPySummaryResults
// tag { chunk }
// script:
// """
// ##conda config --add channels dansondergaard
//
// ##If interested in using tmhmm.py:
// tmhmm -m \${CONDA_PREFIX}/TMHMM2.0.model -f ${chunk}
// ##It makes 3 files, *.annotation, *.plot, *.summary
// """
//}
process tmhmmMakeGff3 {
executor 'local'
cpus 1
beforeScript params.before_script_cmds
input:
file tmhmmResultToGff3
tag{ dateMetadataPrefix }
output:
file "*.gff3" into tmhmmGff3ToLiftover, tmhmmGff3
script:
"""
#!/usr/bin/env python
import os
import re
write_handle = open("tmhmm.pep.gff3","w")
read_handle = open("${tmhmmResultToGff3}","r")
for line in read_handle.readlines():
if line[0] == "#" or len(line.strip()) == 0:
continue
splitrow = re.split("\\t+",line)
protein_id = splitrow[0]
topology = splitrow[5]
matches = re.findall("([oi][0-9]+-[0-9]+)",topology)
for m in matches:
type = m[0]
start,end = m[1:].split("-")
outstring = "\\t".join([protein_id,"tmhmm","transmembrane_region",start,end,".",".",".","."])+os.linesep
write_handle.write(outstring)
read_handle.close()
write_handle.close()
"""