From b2e423fbab6f2e4e56948ebc0eea0c3b6cdfcfe6 Mon Sep 17 00:00:00 2001 From: Francoise Thibaud-Nissen <43827521+thibaudnis@users.noreply.github.com> Date: Tue, 23 May 2023 15:05:27 -0400 Subject: [PATCH 01/26] Update README.md Documented the addition of CheckM and ANI to the workflow in the intro. Addition of reference and license terms for CheckM. --- README.md | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fec34e0..106bbb9 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,14 @@ in 2001 and is regularly upgraded to improve structural and functional annotation quality ([Li W, O'Neill KR et al 2021](https://www.ncbi.nlm.nih.gov/pubmed/33270901)). Recent improvements include utilization of curated protein profile hidden Markov models (HMMs), and curated complex domain architectures for functional annotation of proteins and -annotation of Enzyme Commission numbers and Gene Ontology terms. +annotation of Enzyme Commission numbers and Gene Ontology terms. Post-annotation, the +completeness of the annotated gene set is estimated with +[CheckM](https://pubmed.ncbi.nlm.nih.gov/25977477/). + +The workflow provided here also offers the option to confirm or correct the organism +associated with the genome assembly prior to starting the annotation, using the +[Average Nucleotide Identity tool](https://pubmed.ncbi.nlm.nih.gov/29792589/). +. Get started by watching this [webinar](https://www.youtube.com/watch?v=pNn_-_46lpI)! @@ -56,6 +63,11 @@ Int J Syst Evol Microbiol. 2018 Jul;68(7):2386-2392. Lomsadze A, Gemayel K, Tang S, Borodovsky M.\ Genome Research. 2018; 28(7):1079-1089. +### CheckM +[CheckM: assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes](https://pubmed.ncbi.nlm.nih.gov/25977477/)\ +Parks DH, Imelfort M, Skennerton CT, Hugenholtz P, Tyson GW.\ +Genome Research. 2015; 25(7):1043-1055. 
+ ### TIGRFAMs [TIGRFAMs: a protein family resource for the functional identification of proteins.](https://www.ncbi.nlm.nih.gov/pubmed/11125044)\ @@ -107,6 +119,12 @@ GeneMarkS-2+ is distributed as part of PGAP with limited rights of use and redistribution from the Georgia Tech Research Corporation. See the [full text of the license](GeneMarkS_Software_License.txt). +### CheckM + +GNU General Public License v3.0 + +Permissions of this strong copyleft license are conditioned on making available complete source code of licensed works and modifications, which include larger works using a licensed work, under the same license. Copyright and license notices must be preserved. Contributors provide an express grant of patent rights. See the [full text of the license](Check-M-license.txt). + ### TIGRFAMs The original TIGRFAMs database was a research project of the J. Craig From c17cac4c046f8ba2b8574a121c44a72d2e6b27e6 Mon Sep 17 00:00:00 2001 From: Francoise Thibaud-Nissen <43827521+thibaudnis@users.noreply.github.com> Date: Tue, 23 May 2023 15:57:01 -0400 Subject: [PATCH 02/26] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 106bbb9..2b94b82 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,6 @@ completeness of the annotated gene set is estimated with The workflow provided here also offers the option to confirm or correct the organism associated with the genome assembly prior to starting the annotation, using the [Average Nucleotide Identity tool](https://pubmed.ncbi.nlm.nih.gov/29792589/). -. Get started by watching this [webinar](https://www.youtube.com/watch?v=pNn_-_46lpI)! @@ -68,6 +67,7 @@ Genome Research. 2018; 28(7):1079-1089. Parks DH, Imelfort M, Skennerton CT, Hugenholtz P, Tyson GW.\ Genome Research. 2015; 25(7):1043-1055. 
+ ### TIGRFAMs [TIGRFAMs: a protein family resource for the functional identification of proteins.](https://www.ncbi.nlm.nih.gov/pubmed/11125044)\ From a2d6cd4c53bf3501f6bd79edebb7ca30bba8456f Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Fri, 26 May 2023 10:59:41 -0400 Subject: [PATCH 03/26] moved here from SVN; JIRA: PGAPX-1148 --- .../stdvalsum2single-genome-validation.xsl | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 scripts/stdvalsum2single-genome-validation.xsl diff --git a/scripts/stdvalsum2single-genome-validation.xsl b/scripts/stdvalsum2single-genome-validation.xsl new file mode 100644 index 0000000..70ef03b --- /dev/null +++ b/scripts/stdvalsum2single-genome-validation.xsl @@ -0,0 +1,59 @@ + + + + + + + + + + + + + STATISTICS OF ASNVALIDATE DIAGNOSTICS + STATISTICS OF ASNDISC DIAGNOSTICS + + + + + + + + + asnval + asndisc + + + + + + + + + + + + + + + +
+ +
Application/levelLevel/codeCount
+ + +
+
From 1e87186a703de3ea4c06dee6867a121ebd5b1a27 Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Mon, 12 Jun 2023 08:19:31 -0400 Subject: [PATCH 04/26] pass Validate_Annotation_collect_annot_stats/output as all_proc_annot_stats output of wf_common - a top workflow for Pathogen processing; JIRA: PGAPX-1162 --- wf_common.cwl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wf_common.cwl b/wf_common.cwl index 93ecfa0..5f332b0 100755 --- a/wf_common.cwl +++ b/wf_common.cwl @@ -1050,6 +1050,9 @@ outputs: proc_annot_stats: type: File outputSource: Validate_Annotation_proc_annot_stats/var_proc_annot_stats_xml + all_proc_annot_stats: + type: File + outputSource: Validate_Annotation_collect_annot_stats/output initial_asndisc_error_diag: type: File? outputSource: Prepare_Unannotated_Sequences_asndisc_evaluate/xml_output From ddaa4aa27076449627a06f72e05d49b34cffaaee Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Tue, 20 Jun 2023 13:12:01 -0400 Subject: [PATCH 05/26] use non-NCBI accessions in FASTA input for producing 'enhanced', ROARY-ready gff output; JIRA: PGAPX-1161 --- progs/asn2fasta.cwl | 4 ++++ wf_common.cwl | 13 ++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/progs/asn2fasta.cwl b/progs/asn2fasta.cwl index 03a48a1..daa995f 100644 --- a/progs/asn2fasta.cwl +++ b/progs/asn2fasta.cwl @@ -25,6 +25,10 @@ inputs: type: boolean? inputBinding: prefix: -prots-only + ignore_orig_id: + type: boolean? + inputBinding: + prefix: -ignore-origid fasta_name: type: string? 
inputBinding: diff --git a/wf_common.cwl b/wf_common.cwl index 5f332b0..f75be45 100755 --- a/wf_common.cwl +++ b/wf_common.cwl @@ -748,11 +748,22 @@ steps: nuc_fasta_name: default: annot.fna out: [nuc_fasta] + Generate_Annotation_Reports_nuc_non_NCBI_fasta: + run: progs/asn2fasta.cwl + in: + i: Final_Bacterial_Package_sqn2gbent/output + type: + default: seq-entry + nuc_fasta_name: + default: annot_non_NCBI.fna + ignore_orig_id: + default: true + out: [nuc_fasta] Generate_Annotation_Reports_gff_enhanced: run: progs/produce_enhanced_gff.cwl in: gff: Generate_Annotation_Reports_gff/output - fasta: Generate_Annotation_Reports_nuc_fasta/nuc_fasta + fasta: Generate_Annotation_Reports_nuc_non_NCBI_fasta/nuc_fasta out: [output] Generate_Annotation_Reports_prot_fasta: run: progs/asn2fasta.cwl From 288d320b2dff4908df9b5d8fc6d0ce06b6d9c786 Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Wed, 21 Jun 2023 07:16:31 -0400 Subject: [PATCH 06/26] Revert "use non-NCBI accessions in FASTA input for producing 'enhanced', ROARY-ready gff output; JIRA: PGAPX-1161" This reverts commit ddaa4aa27076449627a06f72e05d49b34cffaaee. --- progs/asn2fasta.cwl | 4 ---- wf_common.cwl | 13 +------------ 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/progs/asn2fasta.cwl b/progs/asn2fasta.cwl index daa995f..03a48a1 100644 --- a/progs/asn2fasta.cwl +++ b/progs/asn2fasta.cwl @@ -25,10 +25,6 @@ inputs: type: boolean? inputBinding: prefix: -prots-only - ignore_orig_id: - type: boolean? - inputBinding: - prefix: -ignore-origid fasta_name: type: string? 
inputBinding: diff --git a/wf_common.cwl b/wf_common.cwl index f75be45..5f332b0 100755 --- a/wf_common.cwl +++ b/wf_common.cwl @@ -748,22 +748,11 @@ steps: nuc_fasta_name: default: annot.fna out: [nuc_fasta] - Generate_Annotation_Reports_nuc_non_NCBI_fasta: - run: progs/asn2fasta.cwl - in: - i: Final_Bacterial_Package_sqn2gbent/output - type: - default: seq-entry - nuc_fasta_name: - default: annot_non_NCBI.fna - ignore_orig_id: - default: true - out: [nuc_fasta] Generate_Annotation_Reports_gff_enhanced: run: progs/produce_enhanced_gff.cwl in: gff: Generate_Annotation_Reports_gff/output - fasta: Generate_Annotation_Reports_nuc_non_NCBI_fasta/nuc_fasta + fasta: Generate_Annotation_Reports_nuc_fasta/nuc_fasta out: [output] Generate_Annotation_Reports_prot_fasta: run: progs/asn2fasta.cwl From aa4f7ddf823b24ad83c5f6854ef1b2ff64f5fb59 Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Wed, 21 Jun 2023 12:06:18 -0400 Subject: [PATCH 07/26] reduce number of hashes before FASTA from three to two, because this is the standard; JIRA: PGAPX-1146. Both work though in roary --- progs/produce_enhanced_gff.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/progs/produce_enhanced_gff.cwl b/progs/produce_enhanced_gff.cwl index 311628e..136c370 100644 --- a/progs/produce_enhanced_gff.cwl +++ b/progs/produce_enhanced_gff.cwl @@ -17,7 +17,7 @@ inputs: position: 1 separator: type: string? - default: '### FASTA' + default: '## FASTA' separator_file: type: string? 
default: 'separator.txt' From da7870966f183a2b8cb04e90a96a07b1203e2dc5 Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Fri, 23 Jun 2023 07:06:17 -0400 Subject: [PATCH 08/26] move enhanced, Roary-capable, GFF constructions from wf_common.cwl to pgap.cwl; JIRA: PGAPX-1161 --- pgap.cwl | 11 +++++++++-- wf_common.cwl | 9 --------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/pgap.cwl b/pgap.cwl index ef60064..4232467 100755 --- a/pgap.cwl +++ b/pgap.cwl @@ -47,7 +47,7 @@ outputs: outputSource: standard_pgap/gff type: File gff_enhanced: - outputSource: standard_pgap/gff_enhanced + outputSource: Generate_Annotation_Reports_gff_enhanced/output type: File sqn: outputSource: standard_pgap/sqn @@ -184,5 +184,12 @@ steps: no_internet: no_internet make_uuid: make_uuid uuid_in: uuid_in - out: [gbent, gbk, gff, gff_enhanced, nucleotide_fasta, protein_fasta, cds_nucleotide_fasta, cds_protein_fasta, sqn, initial_asndisc_error_diag, initial_asnval_error_diag, final_asndisc_error_diag, final_asnval_error_diag, checkm_raw, checkm_results] + out: [gbent, gbk, gff, nucleotide_fasta, protein_fasta, cds_nucleotide_fasta, cds_protein_fasta, sqn, initial_asndisc_error_diag, initial_asnval_error_diag, final_asndisc_error_diag, final_asnval_error_diag, checkm_raw, checkm_results] run: wf_common.cwl + Generate_Annotation_Reports_gff_enhanced: + run: progs/produce_enhanced_gff.cwl + in: + gff: standard_pgap/gff + fasta: fasta + out: [output] + diff --git a/wf_common.cwl b/wf_common.cwl index 5f332b0..3832b59 100755 --- a/wf_common.cwl +++ b/wf_common.cwl @@ -748,12 +748,6 @@ steps: nuc_fasta_name: default: annot.fna out: [nuc_fasta] - Generate_Annotation_Reports_gff_enhanced: - run: progs/produce_enhanced_gff.cwl - in: - gff: Generate_Annotation_Reports_gff/output - fasta: Generate_Annotation_Reports_nuc_fasta/nuc_fasta - out: [output] Generate_Annotation_Reports_prot_fasta: run: progs/asn2fasta.cwl in: @@ -1026,9 +1020,6 @@ outputs: gff: type: File outputSource: 
Generate_Annotation_Reports_gff/output - gff_enhanced: - type: File - outputSource: Generate_Annotation_Reports_gff_enhanced/output gbk: type: File outputSource: Generate_Annotation_Reports_gbk/output From f347c7a0ff86304b85d6df4a659c7d6f05aa6615 Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Mon, 26 Jun 2023 07:44:09 -0400 Subject: [PATCH 09/26] remove empty files unconditionally; JIRA: PGAPX-1144 --- scripts/pgap.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/pgap.py b/scripts/pgap.py index ea62eaf..e56965f 100755 --- a/scripts/pgap.py +++ b/scripts/pgap.py @@ -1011,7 +1011,6 @@ def main(): sys.exit(1) else: print("Ignoring") - remove_empty_files(args.output) if not args.ani_only: p = Pipeline(params, args.input, "pgap") @@ -1024,7 +1023,7 @@ def main(): submol_modified = os.path.join(args.output, p.submol) if os.path.exists(submol_modified): os.remove(submol_modified) - remove_empty_files(args.output) + remove_empty_files(args.output) except (Exception, KeyboardInterrupt) as exc: if args.debug: From a72b4e603694c0318ba02c1274e51f9891296dad Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Mon, 26 Jun 2023 09:56:00 -0400 Subject: [PATCH 10/26] replaced all references to args.output to params.outputdir; args.output is requested output directory, if it exists, new, numbered, directory will be created and params.outputdir no longer is the same as args.output; the situation is exacerbated by the fact that args.output exists from the previous run, so we had a potential of creating files from current run in previous run. 
Now this is corrected; JIRA: PGAPX-1144 --- scripts/pgap.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/scripts/pgap.py b/scripts/pgap.py index e56965f..dc3fd59 100755 --- a/scripts/pgap.py +++ b/scripts/pgap.py @@ -506,7 +506,7 @@ def launch(self): {"file": "initial_asndisc_diag.xml", "remove": True}, {"file": "initial_asnval_diag.xml", "remove": True} ] - self.report_output_files(self.params.args.output, output_files) + self.report_output_files(self.params.outputdir, output_files) return proc.returncode class Setup: @@ -976,14 +976,18 @@ def main(): # args.output for some reason not always available time.sleep(1) # analyze ani output here - if not os.path.exists(args.output): - print("INTERNAL(SYSTEM)PROBLEM: abort: output directory does not exist: {}".format(args.output)) + print (f"DEBUG: args.output = {args.output}") + print (f"DEBUG: params.outputdir = {params.outputdir}") + outputdir = args.output # this does not work + outputdir = params.outputdir + if not os.path.exists(outputdir): + print("INTERNAL(SYSTEM)PROBLEM: abort: output directory does not exist: {}".format(outputdir)) if args.ignore_all_errors == False: sys.exit(1) else: print("Ignoring") - params.ani_output = os.path.join(args.output, "ani-tax-report.xml") - params.ani_hr_output = os.path.join(args.output, "ani-tax-report.txt") + params.ani_output = os.path.join(outputdir, "ani-tax-report.xml") + params.ani_hr_output = os.path.join(outputdir, "ani-tax-report.txt") if os.path.exists(params.ani_output) and os.path.getsize(params.ani_output) > 0: True else: @@ -993,7 +997,7 @@ def main(): else: params.ani_hr_output = None - errors_xml_fn = os.path.join(args.output, "errors.xml") + errors_xml_fn = os.path.join(outputdir, "errors.xml") # if there are errors # and we do not want to recover them when it is recoverable # then bail @@ -1017,13 +1021,13 @@ def main(): retcode = p.launch() p.cleanup() if retcode == 0: - for errors_xml_fn in 
glob.glob(os.path.join(args.output, "errors.xml")): + for errors_xml_fn in glob.glob(os.path.join(outputdir, "errors.xml")): os.remove(errors_xml_fn) if(p.submol != None): - submol_modified = os.path.join(args.output, p.submol) + submol_modified = os.path.join(outputdir, p.submol) if os.path.exists(submol_modified): os.remove(submol_modified) - remove_empty_files(args.output) + remove_empty_files(outputdir) except (Exception, KeyboardInterrupt) as exc: if args.debug: From 12006824f205dc40abfd81302a41edb1c0ff213f Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Tue, 27 Jun 2023 05:39:05 -0400 Subject: [PATCH 11/26] added missing initialization for outputdir in one function. JIRA: PGAPX-1161 --- scripts/pgap.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/pgap.py b/scripts/pgap.py index dc3fd59..7ed5ddb 100755 --- a/scripts/pgap.py +++ b/scripts/pgap.py @@ -1020,6 +1020,7 @@ def main(): p = Pipeline(params, args.input, "pgap") retcode = p.launch() p.cleanup() + outputdir = p.params.outputdir if retcode == 0: for errors_xml_fn in glob.glob(os.path.join(outputdir, "errors.xml")): os.remove(errors_xml_fn) From d3d4f1364d2e95b49869e2cb40e4722eb4b7d5b3 Mon Sep 17 00:00:00 2001 From: George Coulouris Date: Fri, 11 Aug 2023 13:49:36 -0400 Subject: [PATCH 12/26] JIRA PGAPX-1167 simplify output directory handling --- scripts/pgap.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/scripts/pgap.py b/scripts/pgap.py index 7ed5ddb..59ee817 100755 --- a/scripts/pgap.py +++ b/scripts/pgap.py @@ -642,15 +642,9 @@ def get_use_version(self): def get_output_dir(self): outputdir = os.path.abspath(self.args.output) if os.path.exists(outputdir): - parent, base = os.path.split(outputdir) - counter = 0 - for sibling in os.listdir(parent): - if sibling.startswith(base + '.'): - ext = sibling[len(base)+1:] - if ext.isdecimal(): - counter = max(counter, int(ext)) - outputdir = os.path.join(parent, base+'.'+str(counter+1)) - return outputdir + 
sys.exit(f"Output directory {outputdir} exists, exiting.") + else: + return outputdir def get_docker_info(self): docker_type_alternatives = ['docker', 'podman', 'singularity', 'apptainer'] From b435014cd9e0c57bbefd7056ca4ad509aacef049 Mon Sep 17 00:00:00 2001 From: George Coulouris Date: Fri, 11 Aug 2023 16:16:53 -0400 Subject: [PATCH 13/26] JIRA PGAPX-1168 suppress help for branch option --- scripts/pgap.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/scripts/pgap.py b/scripts/pgap.py index 59ee817..f7aeffd 100755 --- a/scripts/pgap.py +++ b/scripts/pgap.py @@ -889,11 +889,10 @@ def main(): help='Print currently set up PGAP version') parser.add_argument('-v', '--verbose', action='store_true', help='Verbose mode') - - version_group = parser.add_mutually_exclusive_group() - version_group.add_argument('--dev', action='store_true', help=argparse.SUPPRESS) # help="Set development mode") - version_group.add_argument('--test', action='store_true', help=argparse.SUPPRESS) # help="Set test mode") - version_group.add_argument('--prod', action='store_true', help="Use a production candidate version. For internal testing.") + + parser.add_argument('--dev', action='store_true', help=argparse.SUPPRESS) # help="Set development mode") + parser.add_argument('--test', action='store_true', help=argparse.SUPPRESS) # help="Set test mode") + parser.add_argument('--prod', action='store_true', help=argparse.SUPPRESS) # help="Use a production candidate version. For internal testing." 
ani_group = parser.add_mutually_exclusive_group() ani_group.add_argument('--taxcheck', dest='ani', action='store_true', help="Also calculate the Average Nucleotide Identity") From e2db4df99f7134bb072f41f0a3d8d6ce329e8c24 Mon Sep 17 00:00:00 2001 From: George Coulouris Date: Fri, 11 Aug 2023 16:26:19 -0400 Subject: [PATCH 14/26] JIRA PGAPX-1166 ensure that exactly one of -r or -n is specified --- scripts/pgap.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/pgap.py b/scripts/pgap.py index f7aeffd..075094f 100755 --- a/scripts/pgap.py +++ b/scripts/pgap.py @@ -948,6 +948,9 @@ def main(): args = parser.parse_args() + if ( (not args.report_usage_true) and (not args.report_usage_false) ): + parser.error("One of -n/--report-usage-false or -r/--report-usage-true must be provided.") + # Check for the different no_yaml_group arguments scenarios. if (args.genome and not args.organism) or (not args.genome and args.organism): parser.error("Invalid Command Line Argument Error: Both arguments -s\--organism and -g\--genome must be provided if no YAML file is provided.") From 2ecfcc85acd05ef02479dc3a3f8cf001470ba397 Mon Sep 17 00:00:00 2001 From: George Coulouris Date: Tue, 15 Aug 2023 13:55:33 -0400 Subject: [PATCH 15/26] JIRA PGAPX-1166 correction to previous commit --- scripts/pgap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pgap.py b/scripts/pgap.py index 075094f..194eb14 100755 --- a/scripts/pgap.py +++ b/scripts/pgap.py @@ -948,7 +948,7 @@ def main(): args = parser.parse_args() - if ( (not args.report_usage_true) and (not args.report_usage_false) ): + if ( (args.input or args.genome) and (not args.report_usage_true) and (not args.report_usage_false) ): parser.error("One of -n/--report-usage-false or -r/--report-usage-true must be provided.") # Check for the different no_yaml_group arguments scenarios. 
From 6469dd8af5489e8b9da92bdac053ecd6659ee7ae Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Thu, 24 Aug 2023 15:18:09 -0400 Subject: [PATCH 16/26] remove .gz suffix in gpx_make_outputs -output it is now added automatically; JIRA: PGAPX-1180 --- amr_naming/wf_amr_naming.cwl | 136 +++++++++++++++++++++++ expr/supplemental_data_split_dir.cwl | 5 + progs/amr_finder_plus.cwl | 45 ++++++++ progs/amr_plus_adaptor.cwl | 30 +++++ progs/asn_adjust.cwl | 26 +++++ progs/map_amr_ids.cwl | 51 +++++++++ task_types/tt_amr_finder_plus.cwl | 53 +++++++++ task_types/tt_kmer_compare_wnode.cwl | 2 +- task_types/tt_kmer_ref_compare_wnode.cwl | 2 +- wf_common.cwl | 16 ++- 10 files changed, 363 insertions(+), 3 deletions(-) create mode 100755 amr_naming/wf_amr_naming.cwl create mode 100644 progs/amr_finder_plus.cwl create mode 100644 progs/amr_plus_adaptor.cwl create mode 100755 progs/asn_adjust.cwl create mode 100644 progs/map_amr_ids.cwl create mode 100644 task_types/tt_amr_finder_plus.cwl diff --git a/amr_naming/wf_amr_naming.cwl b/amr_naming/wf_amr_naming.cwl new file mode 100755 index 0000000..14c9cd4 --- /dev/null +++ b/amr_naming/wf_amr_naming.cwl @@ -0,0 +1,136 @@ +#!/usr/bin/env cwl-runner +label: "Naming AMR Genes Plane" +cwlVersion: v1.2 +class: Workflow +requirements: + - class: SubworkflowFeatureRequirement + - class: MultipleInputFeatureRequirement + +inputs: + annotation: + type: File + taxon_db: + type: File + taxid: + type: int + database: + type: Directory + tax_group_name: + type: string +steps: + Prepare_AMR_Annotation_Input: + label: "Prepare AMR Annotation Input" + run: ../progs/asn_translator.cwl + in: + input: annotation + output_output: {default: 'annotation.asn'} + out: [output] + Remove_Extraneous_Protein_Ids: + label: "Remove Extraneous Protein Ids" + run: ../progs/asn_adjust.cwl + # tasktype adjust_entries + # action node class: CAdjustActionNode + # action node an_adjust_node + # action node complication: pass input if prog generates no output + # 
asn_adjust -input-manifest inp/entries.mft -output-path out -fix-prots-to-gnl-id -t + # creates translated.asn if creates + in: + input: Prepare_AMR_Annotation_Input/output + out: [entries] + Get_Fasta: + label: "Get Fasta" + run: ../progs/asn2fasta.cwl + in: + i: Remove_Extraneous_Protein_Ids/entries + type: + default: seq-entry + prot_fasta_name: + default: proteins.fa + nuc_fasta_name: + default: nucs.fa + + out: [nuc_fasta, prot_fasta] + # tasktype get_fasta_from_asn + # action node class: CAsn2FastaActionNode + # action node an_asn2fasta + # application: asn2fasta -i translated.asn -op out/proteins.fa -on out/nucs.fa -serial text -type seq-entry + Convert_Annotations_To_Gff: + label: "Convert Annotations To Gff" + run: ../progs/gp_annot_format.cwl + in: + input: Remove_Extraneous_Protein_Ids/entries + ifmt: + default: seq-entry + t: + default: true + ofmt: + default: gff3 + oname: + default: 'annot.gff' + exclude_external: + default: true + out: [output] + + # tasktype: annot_to_gff + # action node class: CAnnotFormatActionNode + # action node: an_annot_format + # application: gp_annot_format -ifmt seq-entry -input-manifest inp/entries.mft -o out/annot.gff -ofmt gff3 -output-manifest out/gff.mft -exclude-external -t + AMR_report: + label: "AMR Report" + run: ../task_types/tt_amr_finder_plus.cwl + in: + proteins: Get_Fasta/prot_fasta + gff: Convert_Annotations_To_Gff/output + nucleotides: Get_Fasta/nuc_fasta + database: database + taxon_db: taxon_db + taxid: taxid + organism: tax_group_name + out: [report] + # tasktype: amr_finder_plus + # action node: an_amr_plus + # action node class: CAmrPlusActionNode + # application GPC: amr_finder_plus -executable ${GP_HOME}/third-party/AMRFinderPlus/amrfinder -database-location ${GP_HOME}/third-party/data/AMRFinderPlus -special-organisms Salmonella,Escherichia|Shigella,Campylobacter + # not sure why '=' delimiters are applied below, according to --help, they + # are not needed. 
+ # + # application: amrfinder '--protein=brd/get_fasta_from_asn.258858932/out/proteins.fa' '--gff=brd/annot_to_gff.258858942/out/annot.gff' '--nucleotide=brd/get_fasta_from_asn.258858932/out/nucs.fa' '--database=/netmnt/vast01/gp/ThirdParty/ExternalData/AMRFinderPlus/2023-04-17.1' --threads 1 '--output=brd/amr_finder_plus.258858952/out/output.tsv' --plus --gpipe_org --pgap + # Complications: + # - need to install new third party application amr_finder_plus (ticket opened) + # - need to install new third party database amr_finder_plus (ticket opened) + # currently only dead symlinks to VAST gp/ThirdParty area + # - uses PATH to pass some info to application + # - action node specifies parameters conditional on "organism" parameter + # GPIPE_REGR_BCT does not have it set, but Pathogen production on the cloud + # will. + # - action node will be implemented as a separate application (ticket opened) + + + Map_Contig_and_Protein_Ids: + label: "Map Contig and Protein Ids" + run: ../progs/map_amr_ids.cwl + # Task Type: map_amr_ids + # GCP parameters: map_amr_ids -gencoll-id ${GP_gencoll_release} -id-mappings ${output}/id_mappings.tsv -input-manifest ${input.report} -o ${output}/report.tsv -dryrun -load-to-database + # action node: CMapAmrIdsActionNode + # application: map_amr_ids -gencoll-id 37054708 -id-mappings out/id_mappings.tsv -input-manifest inp/report.mft -o out/report.tsv -dryrun -load-to-database + + in: + report: AMR_report/report + gencoll_id: + id_mappings_outname: + default: 'id_mappings.tsv' + report_outname: + default: 'modified_report.tsv' + dryrun: + default: true + load_to_database: + default: false + out: [report] + + +outputs: + amr_report: + type: File + outputSource: + Map_Contig_and_Protein_Ids/report + diff --git a/expr/supplemental_data_split_dir.cwl b/expr/supplemental_data_split_dir.cwl index 8e47673..e3deb1a 100644 --- a/expr/supplemental_data_split_dir.cwl +++ b/expr/supplemental_data_split_dir.cwl @@ -33,6 +33,9 @@ expression: | case 
'AntiFamLib': r['AntiFamLib'] = l[i]; break; + case 'AMRFinderPlus': + r['amr_finder_plus_database'] = l[i]; + break; case 'asn2pas.xsl': r['asn2pas_xsl'] = l[i]; break; @@ -146,6 +149,8 @@ outputs: type: Directory all_order_specific_blastdb_file: type: File + amr_finder_plus_database: + type: File asn2pas_xsl: type: File identification_db_dir: diff --git a/progs/amr_finder_plus.cwl b/progs/amr_finder_plus.cwl new file mode 100644 index 0000000..fac9c5e --- /dev/null +++ b/progs/amr_finder_plus.cwl @@ -0,0 +1,45 @@ +cwlVersion: v1.2 +label: "amr_finder_plus" + +class: CommandLineTool +baseCommand: amrfinder +inputs: + nucleotide: + type: File + inputBinding: + prefix: --nucleotide + database: + type: Directory + inputBinding: + prefix: --database + threads: + type: int + default: 1 + inputBinding: + prefix: --threads + output_name: + type: string + default: 'output.tsv' + inputBinding: + prefix: --output + plus: + type: boolean? + inputBinding: + prefix: --plus + gpipe_org: + type: boolean? + inputBinding: + prefix: --gpipe_org + pgap: + type: boolean? + inputBinding: + prefix: --pgap + organism: + type: string? + inputBinding: + prefix: --organism +outputs: + report: + type: File + outputBinding: + glob: $(inputs.output_name) diff --git a/progs/amr_plus_adaptor.cwl b/progs/amr_plus_adaptor.cwl new file mode 100644 index 0000000..388cb09 --- /dev/null +++ b/progs/amr_plus_adaptor.cwl @@ -0,0 +1,30 @@ +cwlVersion: v1.2 +label: "amr_plus_adaptor" + + +class: CommandLineTool +baseCommand: amr_plus_adaptor +# ~/gpipe-arch-bin/amr_plus_adaptor -organism Staphylococcus_aureus -taxid 1280 -out-organism-parameter-file out-organism-parameter-file -taxon-db taxonomy.sqlite3 +inputs: + taxon_db: + type: File + inputBinding: + prefix: -taxon-db + organism: + type: string? 
+ inputBinding: + prefix: -organism + taxid: + type: int + inputBinding: + prefix: -taxid + out_organism_parameter_file: + type: string + default: 'organism_parameter.txt' + inputBinding: + prefix: -out-organism-parameter-file +outputs: + organism_parameter_in_file: + type: File + outputBinding: + glob: $(inputs.out_organism_parameter_file) diff --git a/progs/asn_adjust.cwl b/progs/asn_adjust.cwl new file mode 100755 index 0000000..75fd5bd --- /dev/null +++ b/progs/asn_adjust.cwl @@ -0,0 +1,26 @@ +cwlVersion: v1.2 +label: "asn_adjust" +requirements: + - class: InitialWorkDirRequirement + listing: + - entry: $(inputs.input) + writable: True + +class: CommandLineTool +baseCommand: asn_adjust +arguments: ['-fix-prots-to-gnl-id','-t', '-output-path', 'out'] + +inputs: + input: + type: File + inputBinding: + prefix: -input + +outputs: + entries: + type: File + outputBinding: + glob: $(inputs.input.basename) + + + diff --git a/progs/map_amr_ids.cwl b/progs/map_amr_ids.cwl new file mode 100644 index 0000000..638a8b4 --- /dev/null +++ b/progs/map_amr_ids.cwl @@ -0,0 +1,51 @@ +cwlVersion: v1.2 +label: "map_amr_ids" + +class: CommandLineTool +baseCommand: map_amr_ids +requirements: + - class: InlineJavascriptRequirement + - class: ResourceRequirement + ramMax: 3000 + - class: InitialWorkDirRequirement + listing: + - entry: $(inputs.report) + writable: False + + + +inputs: + report: + type: File + inputBinding: + prefix: -input + gencoll_id: + type: int + inputBinding: + prefix: -gencoll-id + dryrun: + type: boolean? + inputBinding: + prefix: -dryrun + load_to_database: + type: boolean? 
+ inputBinding: + prefix: -load-to-database + report_outname: + type: string + inputBinding: + prefix: -o +outputs: + report: + type: File + outputBinding: + outputEval: | + ${ + var newoutput = new File(inputs.outname); + if(newoutput.exists()) { + return newoutput; + } + else { + return inputs.report; + } + } diff --git a/task_types/tt_amr_finder_plus.cwl b/task_types/tt_amr_finder_plus.cwl new file mode 100644 index 0000000..51ba82c --- /dev/null +++ b/task_types/tt_amr_finder_plus.cwl @@ -0,0 +1,53 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.2 +label: "amr_finder_plus" +class: Workflow + +inputs: + gff: File + nucleotides: File + proteins: File + database: Directory + # used by an_amr_plus as build.org.taxid, we need to pass this from the very top + taxid: int + organism: string + taxon_db: File + +outputs: + report: + type: File + outputSource: + proper_application_step/report +steps: + action_node_equivalent: + run: ../progs/amr_plus_adaptor.cwl + in: + organism: organism + taxid: taxid + taxon_db: taxon_db + out: [organism_parameter_in_file] + # example of passing things from step action_node_equivalent + organism_parameter: + run: ../progs/file2string.cwl + in: + input: action_node_equivalent/organism_parameter_in_file + out: [value] + + proper_application_step: + run: ../progs/amr_finder_plus.cwl + in: + proteins: proteins + gff: gff + nucleotide: nucleotides + organism: organism_parameter/value + + database: database + plus: + default: true + gpipe_org: + default: true + pgap: + default: true + out: [report] + + diff --git a/task_types/tt_kmer_compare_wnode.cwl b/task_types/tt_kmer_compare_wnode.cwl index 70d9284..bbf37d5 100644 --- a/task_types/tt_kmer_compare_wnode.cwl +++ b/task_types/tt_kmer_compare_wnode.cwl @@ -36,7 +36,7 @@ steps: num_partitions: default: 1 output: - default: "distances.##.gz" + default: "distances.##" output_glob: default: "distances.*.gz" unzip: diff --git a/task_types/tt_kmer_ref_compare_wnode.cwl 
b/task_types/tt_kmer_ref_compare_wnode.cwl index f59ee46..0daf1be 100644 --- a/task_types/tt_kmer_ref_compare_wnode.cwl +++ b/task_types/tt_kmer_ref_compare_wnode.cwl @@ -43,7 +43,7 @@ steps: num_partitions: default: 1 output: - default: "distances.##.gz" + default: "distances.##" output_glob: default: "distances.*.gz" unzip: diff --git a/wf_common.cwl b/wf_common.cwl index 3832b59..f200519 100755 --- a/wf_common.cwl +++ b/wf_common.cwl @@ -120,6 +120,7 @@ steps: - 23s_model_path - AntiFamLib - all_order_specific_blastdb_file + - amr_finder_plus_database - asn2pas_xsl - identification_db_dir - CDDdata2 @@ -561,8 +562,18 @@ steps: # # tasktype coded, input/output matches # # application not coded # ############################################### - # # AMR plane is for later stages skipping + # # AMR plane # ############################################### + AMR_naming: + run: amr_naming/wf_amr_naming.cwl + in: + annotation: bacterial_annot_4/out_annotation + # aka Bacterial_Annot_Filter/out_annotation + database: passdata/amr_finder_plus_database + passdata: passdata/taxon_db + taxid: taxid + out: [amr_report] + bacterial_orthology_conditional: run: bacterial_orthology/wf_bacterial_orthology_conditional.cwl in: @@ -1062,4 +1073,7 @@ outputs: checkm_results: type: File outputSource: checkm/checkm_results + amr_report: + type: File + outputSource: AMR_naming/amr_report From c1ec29351a824b0773f55b46175a04a5089e297d Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Thu, 24 Aug 2023 15:18:35 -0400 Subject: [PATCH 17/26] Revert "remove .gz suffix in gpx_make_outputs -output it is now added automatically; JIRA: PGAPX-1180" This reverts commit 6469dd8af5489e8b9da92bdac053ecd6659ee7ae. 
--- amr_naming/wf_amr_naming.cwl | 136 ----------------------- expr/supplemental_data_split_dir.cwl | 5 - progs/amr_finder_plus.cwl | 45 -------- progs/amr_plus_adaptor.cwl | 30 ----- progs/asn_adjust.cwl | 26 ----- progs/map_amr_ids.cwl | 51 --------- task_types/tt_amr_finder_plus.cwl | 53 --------- task_types/tt_kmer_compare_wnode.cwl | 2 +- task_types/tt_kmer_ref_compare_wnode.cwl | 2 +- wf_common.cwl | 16 +-- 10 files changed, 3 insertions(+), 363 deletions(-) delete mode 100755 amr_naming/wf_amr_naming.cwl delete mode 100644 progs/amr_finder_plus.cwl delete mode 100644 progs/amr_plus_adaptor.cwl delete mode 100755 progs/asn_adjust.cwl delete mode 100644 progs/map_amr_ids.cwl delete mode 100644 task_types/tt_amr_finder_plus.cwl diff --git a/amr_naming/wf_amr_naming.cwl b/amr_naming/wf_amr_naming.cwl deleted file mode 100755 index 14c9cd4..0000000 --- a/amr_naming/wf_amr_naming.cwl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env cwl-runner -label: "Naming AMR Genes Plane" -cwlVersion: v1.2 -class: Workflow -requirements: - - class: SubworkflowFeatureRequirement - - class: MultipleInputFeatureRequirement - -inputs: - annotation: - type: File - taxon_db: - type: File - taxid: - type: int - database: - type: Directory - tax_group_name: - type: string -steps: - Prepare_AMR_Annotation_Input: - label: "Prepare AMR Annotation Input" - run: ../progs/asn_translator.cwl - in: - input: annotation - output_output: {default: 'annotation.asn'} - out: [output] - Remove_Extraneous_Protein_Ids: - label: "Remove Extraneous Protein Ids" - run: ../progs/asn_adjust.cwl - # tasktype adjust_entries - # action node class: CAdjustActionNode - # action node an_adjust_node - # action node complication: pass input if prog generates no output - # asn_adjust -input-manifest inp/entries.mft -output-path out -fix-prots-to-gnl-id -t - # creates translated.asn if creates - in: - input: Prepare_AMR_Annotation_Input/output - out: [entries] - Get_Fasta: - label: "Get Fasta" - run: 
../progs/asn2fasta.cwl - in: - i: Remove_Extraneous_Protein_Ids/entries - type: - default: seq-entry - prot_fasta_name: - default: proteins.fa - nuc_fasta_name: - default: nucs.fa - - out: [nuc_fasta, prot_fasta] - # tasktype get_fasta_from_asn - # action node class: CAsn2FastaActionNode - # action node an_asn2fasta - # application: asn2fasta -i translated.asn -op out/proteins.fa -on out/nucs.fa -serial text -type seq-entry - Convert_Annotations_To_Gff: - label: "Convert Annotations To Gff" - run: ../progs/gp_annot_format.cwl - in: - input: Remove_Extraneous_Protein_Ids/entries - ifmt: - default: seq-entry - t: - default: true - ofmt: - default: gff3 - oname: - default: 'annot.gff' - exclude_external: - default: true - out: [output] - - # tasktype: annot_to_gff - # action node class: CAnnotFormatActionNode - # action node: an_annot_format - # application: gp_annot_format -ifmt seq-entry -input-manifest inp/entries.mft -o out/annot.gff -ofmt gff3 -output-manifest out/gff.mft -exclude-external -t - AMR_report: - label: "AMR Report" - run: ../task_types/tt_amr_finder_plus.cwl - in: - proteins: Get_Fasta/prot_fasta - gff: Convert_Annotations_To_Gff/output - nucleotides: Get_Fasta/nuc_fasta - database: database - taxon_db: taxon_db - taxid: taxid - organism: tax_group_name - out: [report] - # tasktype: amr_finder_plus - # action node: an_amr_plus - # action node class: CAmrPlusActionNode - # application GPC: amr_finder_plus -executable ${GP_HOME}/third-party/AMRFinderPlus/amrfinder -database-location ${GP_HOME}/third-party/data/AMRFinderPlus -special-organisms Salmonella,Escherichia|Shigella,Campylobacter - # not sure why '=' delimiters are applied below, according to --help, they - # are not needed. 
- # - # application: amrfinder '--protein=brd/get_fasta_from_asn.258858932/out/proteins.fa' '--gff=brd/annot_to_gff.258858942/out/annot.gff' '--nucleotide=brd/get_fasta_from_asn.258858932/out/nucs.fa' '--database=/netmnt/vast01/gp/ThirdParty/ExternalData/AMRFinderPlus/2023-04-17.1' --threads 1 '--output=brd/amr_finder_plus.258858952/out/output.tsv' --plus --gpipe_org --pgap - # Complications: - # - need to install new third party application amr_finder_plus (ticket opened) - # - need to install new third party database amr_finder_plus (ticket opened) - # currently only dead symlinks to VAST gp/ThirdParty area - # - uses PATH to pass some info to application - # - action node specifies parameters conditional on "organism" parameter - # GPIPE_REGR_BCT does not have it set, but Pathogen production on the cloud - # will. - # - action node will be implemented as a separate application (ticket opened) - - - Map_Contig_and_Protein_Ids: - label: "Map Contig and Protein Ids" - run: ../progs/map_amr_ids.cwl - # Task Type: map_amr_ids - # GCP parameters: map_amr_ids -gencoll-id ${GP_gencoll_release} -id-mappings ${output}/id_mappings.tsv -input-manifest ${input.report} -o ${output}/report.tsv -dryrun -load-to-database - # action node: CMapAmrIdsActionNode - # application: map_amr_ids -gencoll-id 37054708 -id-mappings out/id_mappings.tsv -input-manifest inp/report.mft -o out/report.tsv -dryrun -load-to-database - - in: - report: AMR_report/report - gencoll_id: - id_mappings_outname: - default: 'id_mappings.tsv' - report_outname: - default: 'modified_report.tsv' - dryrun: - default: true - load_to_database: - default: false - out: [report] - - -outputs: - amr_report: - type: File - outputSource: - Map_Contig_and_Protein_Ids/report - diff --git a/expr/supplemental_data_split_dir.cwl b/expr/supplemental_data_split_dir.cwl index e3deb1a..8e47673 100644 --- a/expr/supplemental_data_split_dir.cwl +++ b/expr/supplemental_data_split_dir.cwl @@ -33,9 +33,6 @@ expression: | case 
'AntiFamLib': r['AntiFamLib'] = l[i]; break; - case 'AMRFinderPlus': - r['amr_finder_plus_database'] = l[i]; - break; case 'asn2pas.xsl': r['asn2pas_xsl'] = l[i]; break; @@ -149,8 +146,6 @@ outputs: type: Directory all_order_specific_blastdb_file: type: File - amr_finder_plus_database: - type: File asn2pas_xsl: type: File identification_db_dir: diff --git a/progs/amr_finder_plus.cwl b/progs/amr_finder_plus.cwl deleted file mode 100644 index fac9c5e..0000000 --- a/progs/amr_finder_plus.cwl +++ /dev/null @@ -1,45 +0,0 @@ -cwlVersion: v1.2 -label: "amr_finder_plus" - -class: CommandLineTool -baseCommand: amrfinder -inputs: - nucleotide: - type: File - inputBinding: - prefix: --nucleotide - database: - type: Directory - inputBinding: - prefix: --database - threads: - type: int - default: 1 - inputBinding: - prefix: --threads - output_name: - type: string - default: 'output.tsv' - inputBinding: - prefix: --output - plus: - type: boolean? - inputBinding: - prefix: --plus - gpipe_org: - type: boolean? - inputBinding: - prefix: --gpipe_org - pgap: - type: boolean? - inputBinding: - prefix: --pgap - organism: - type: string? - inputBinding: - prefix: --organism -outputs: - report: - type: File - outputBinding: - glob: $(inputs.output_name) diff --git a/progs/amr_plus_adaptor.cwl b/progs/amr_plus_adaptor.cwl deleted file mode 100644 index 388cb09..0000000 --- a/progs/amr_plus_adaptor.cwl +++ /dev/null @@ -1,30 +0,0 @@ -cwlVersion: v1.2 -label: "amr_plus_adaptor" - - -class: CommandLineTool -baseCommand: amr_plus_adaptor -# ~/gpipe-arch-bin/amr_plus_adaptor -organism Staphylococcus_aureus -taxid 1280 -out-organism-parameter-file out-organism-parameter-file -taxon-db taxonomy.sqlite3 -inputs: - taxon_db: - type: File - inputBinding: - prefix: -taxon-db - organism: - type: string? 
- inputBinding: - prefix: -organism - taxid: - type: int - inputBinding: - prefix: -taxid - out_organism_parameter_file: - type: string - default: 'organism_parameter.txt' - inputBinding: - prefix: -out-organism-parameter-file -outputs: - organism_parameter_in_file: - type: File - outputBinding: - glob: $(inputs.out_organism_parameter_file) diff --git a/progs/asn_adjust.cwl b/progs/asn_adjust.cwl deleted file mode 100755 index 75fd5bd..0000000 --- a/progs/asn_adjust.cwl +++ /dev/null @@ -1,26 +0,0 @@ -cwlVersion: v1.2 -label: "asn_adjust" -requirements: - - class: InitialWorkDirRequirement - listing: - - entry: $(inputs.input) - writable: True - -class: CommandLineTool -baseCommand: asn_adjust -arguments: ['-fix-prots-to-gnl-id','-t', '-output-path', 'out'] - -inputs: - input: - type: File - inputBinding: - prefix: -input - -outputs: - entries: - type: File - outputBinding: - glob: $(inputs.input.basename) - - - diff --git a/progs/map_amr_ids.cwl b/progs/map_amr_ids.cwl deleted file mode 100644 index 638a8b4..0000000 --- a/progs/map_amr_ids.cwl +++ /dev/null @@ -1,51 +0,0 @@ -cwlVersion: v1.2 -label: "map_amr_ids" - -class: CommandLineTool -baseCommand: map_amr_ids -requirements: - - class: InlineJavascriptRequirement - - class: ResourceRequirement - ramMax: 3000 - - class: InitialWorkDirRequirement - listing: - - entry: $(inputs.report) - writable: False - - - -inputs: - report: - type: File - inputBinding: - prefix: -input - gencoll_id: - type: int - inputBinding: - prefix: -gencoll-id - dryrun: - type: boolean? - inputBinding: - prefix: -dryrun - load_to_database: - type: boolean? 
- inputBinding: - prefix: -load-to-database - report_outname: - type: string - inputBinding: - prefix: -o -outputs: - report: - type: File - outputBinding: - outputEval: | - ${ - var newoutput = new File(inputs.outname); - if(newoutput.exists()) { - return newoutput; - } - else { - return inputs.report; - } - } diff --git a/task_types/tt_amr_finder_plus.cwl b/task_types/tt_amr_finder_plus.cwl deleted file mode 100644 index 51ba82c..0000000 --- a/task_types/tt_amr_finder_plus.cwl +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env cwl-runner -cwlVersion: v1.2 -label: "amr_finder_plus" -class: Workflow - -inputs: - gff: File - nucleotides: File - proteins: File - database: Directory - # used by an_amr_plus as build.org.taxid, we need to pass this from the very top - taxid: int - organism: string - taxon_db: File - -outputs: - report: - type: File - outputSource: - proper_application_step/report -steps: - action_node_equivalent: - run: ../progs/amr_plus_adaptor.cwl - in: - organism: organism - taxid: taxid - taxon_db: taxon_db - out: [organism_parameter_in_file] - # example of passing things from step action_node_equivalent - organism_parameter: - run: ../progs/file2string.cwl - in: - input: action_node_equivalent/organism_parameter_in_file - out: [value] - - proper_application_step: - run: ../progs/amr_finder_plus.cwl - in: - proteins: proteins - gff: gff - nucleotide: nucleotides - organism: organism_parameter/value - - database: database - plus: - default: true - gpipe_org: - default: true - pgap: - default: true - out: [report] - - diff --git a/task_types/tt_kmer_compare_wnode.cwl b/task_types/tt_kmer_compare_wnode.cwl index bbf37d5..70d9284 100644 --- a/task_types/tt_kmer_compare_wnode.cwl +++ b/task_types/tt_kmer_compare_wnode.cwl @@ -36,7 +36,7 @@ steps: num_partitions: default: 1 output: - default: "distances.##" + default: "distances.##.gz" output_glob: default: "distances.*.gz" unzip: diff --git a/task_types/tt_kmer_ref_compare_wnode.cwl 
b/task_types/tt_kmer_ref_compare_wnode.cwl index 0daf1be..f59ee46 100644 --- a/task_types/tt_kmer_ref_compare_wnode.cwl +++ b/task_types/tt_kmer_ref_compare_wnode.cwl @@ -43,7 +43,7 @@ steps: num_partitions: default: 1 output: - default: "distances.##" + default: "distances.##.gz" output_glob: default: "distances.*.gz" unzip: diff --git a/wf_common.cwl b/wf_common.cwl index f200519..3832b59 100755 --- a/wf_common.cwl +++ b/wf_common.cwl @@ -120,7 +120,6 @@ steps: - 23s_model_path - AntiFamLib - all_order_specific_blastdb_file - - amr_finder_plus_database - asn2pas_xsl - identification_db_dir - CDDdata2 @@ -562,18 +561,8 @@ steps: # # tasktype coded, input/output matches # # application not coded # ############################################### - # # AMR plane + # # AMR plane is for later stages skipping # ############################################### - AMR_naming: - run: amr_naming/wf_amr_naming.cwl - in: - annotation: bacterial_annot_4/out_annotation - # aka Bacterial_Annot_Filter/out_annotation - database: passdata/amr_finder_plus_database - passdata: passdata/taxon_db - taxid: taxid - out: [amr_report] - bacterial_orthology_conditional: run: bacterial_orthology/wf_bacterial_orthology_conditional.cwl in: @@ -1073,7 +1062,4 @@ outputs: checkm_results: type: File outputSource: checkm/checkm_results - amr_report: - type: File - outputSource: AMR_naming/amr_report From 177e92fdd6402b052db78fa3a58a933dc1abd85c Mon Sep 17 00:00:00 2001 From: "Badretdin, Azat" Date: Thu, 24 Aug 2023 15:26:05 -0400 Subject: [PATCH 18/26] remove .gz suffix in gpx_make_outputs -output it is now added automatically; JIRA: PGAPX-1180 --- task_types/tt_kmer_compare_wnode.cwl | 2 +- task_types/tt_kmer_ref_compare_wnode.cwl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/task_types/tt_kmer_compare_wnode.cwl b/task_types/tt_kmer_compare_wnode.cwl index 70d9284..bbf37d5 100644 --- a/task_types/tt_kmer_compare_wnode.cwl +++ b/task_types/tt_kmer_compare_wnode.cwl @@ -36,7 
+36,7 @@ steps: num_partitions: default: 1 output: - default: "distances.##.gz" + default: "distances.##" output_glob: default: "distances.*.gz" unzip: diff --git a/task_types/tt_kmer_ref_compare_wnode.cwl b/task_types/tt_kmer_ref_compare_wnode.cwl index f59ee46..0daf1be 100644 --- a/task_types/tt_kmer_ref_compare_wnode.cwl +++ b/task_types/tt_kmer_ref_compare_wnode.cwl @@ -43,7 +43,7 @@ steps: num_partitions: default: 1 output: - default: "distances.##.gz" + default: "distances.##" output_glob: default: "distances.*.gz" unzip: From 05183135e8489f4fe0661557c42ec16ac4d19c73 Mon Sep 17 00:00:00 2001 From: ericjove <123645716+ericjove@users.noreply.github.com> Date: Thu, 31 Aug 2023 13:35:09 -0400 Subject: [PATCH 19/26] JIRA PGAPX-1169 added changes to enable\fix absolute paths for the input arg -g fasta files. --- scripts/pgap.py | 1096 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 1095 insertions(+), 1 deletion(-) diff --git a/scripts/pgap.py b/scripts/pgap.py index 194eb14..70a32f4 100755 --- a/scripts/pgap.py +++ b/scripts/pgap.py @@ -847,6 +847,1095 @@ def remove_empty_files(rootdir): if os.path.isfile(fullname) and os.path.getsize(fullname) == 0: quiet_remove(fullname) +def copy_genome_to_workspace(genome, original_workspace): + + # Check if the input file actually exists + if not os.path.exists(genome): + print(f"Error: The input genome file:{genome} does not exist.") + sys.exit(1) # Exit the script with an error code + + filename = os.path.basename(genome) + new_genome_path = os.path.join(original_workspace, filename) + + # Check if the file with the same name already exists in the workspace + if os.path.exists(new_genome_path): + return filename + + try: + # Attempt to copy the file + shutil.copy2(genome, new_genome_path) + except FileNotFoundError: + print("Error: The genome file {genome} does not exist.") + sys.exit(1) # Exit the script with an error code + + return filename + +def create_simple_input_yaml_file(fasta_location, 
genus_species, output_filename='input.yaml'): + # Note: The args are not validated here, as they are validated when the generated YAML files are ingested in the pipeline. + submol_content = f'''\ +organism: + genus_species: {genus_species} +''' + with open('submol.yaml', 'w') as f: + f.write(submol_content) + + yaml_content = f'''\ +fasta: + class: File + location: {fasta_location} +submol: + class: File + location: submol.yaml +''' + with open(output_filename, 'w') as f: + f.write(yaml_content) + + return os.path.abspath(output_filename) + +def main(): + + parser = argparse.ArgumentParser(description="Input must be provided as:\n" + " 1. a fasta/organism pair, e.g.\n" + " pgap.py ... -g input.fasta -s 'Escherichia coli'\n" + "or\n" + " 2. a YAML configuration file, e.g.\n" + " pgap.py ... input.yaml\n" + , formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument('-g', '--genome', type=str, help='Path to genomic fasta') + + parser.add_argument('-s', '--organism', type=str, help='Binomial name') + parser.add_argument('input', nargs='?', help=argparse.SUPPRESS) + + + parser.add_argument('-V', '--version', action='store_true', + help='Print currently set up PGAP version') + parser.add_argument('-v', '--verbose', action='store_true', + help='Verbose mode') + + parser.add_argument('--dev', action='store_true', help=argparse.SUPPRESS) # help="Set development mode") + parser.add_argument('--test', action='store_true', help=argparse.SUPPRESS) # help="Set test mode") + parser.add_argument('--prod', action='store_true', help=argparse.SUPPRESS) # help="Use a production candidate version. For internal testing." 
+ + ani_group = parser.add_mutually_exclusive_group() + ani_group.add_argument('--taxcheck', dest='ani', action='store_true', help="Also calculate the Average Nucleotide Identity") + ani_group.add_argument('--taxcheck-only', dest='ani_only', action='store_true', help="Only calculate the Average Nucleotide Identity, do not run PGAP") + + parser.add_argument("--auto-correct-tax", + dest='auto_correct_tax', + action='store_true', + help='Use the ANI predicted organism instead of the user-provided organism; requires --taxcheck.') + action_group = parser.add_mutually_exclusive_group() + action_group.add_argument('-l', '--list', action='store_true', help='List available versions.') + action_group.add_argument('-u', '--update', dest='update', action='store_true', + help='Update to the latest PGAP version, including reference data.') + action_group.add_argument('--use-version', dest='use_version', help=argparse.SUPPRESS) + + report_group = parser.add_mutually_exclusive_group() + report_group.add_argument('-r', '--report-usage-true', dest='report_usage_true', action='store_true', + help='Report anonymized usage metadata to NCBI.') + report_group.add_argument('-n', '--report-usage-false', dest='report_usage_false', action='store_true', + help='Do not report anonymized usage metadata to NCBI.') + parser.add_argument("--container-name", + dest='container_name', + help='Specify a container name that will be used instead of automatically generated.') + parser.add_argument("--container-path", + dest='container_path', + help='Override path to image.') + parser.add_argument("--ignore-all-errors", + dest='ignore_all_errors', + action='store_true', + help='Ignore errors from quality control analysis, in order to obtain a draft annotation.') + parser.add_argument("--no-internet", + dest='no_internet', + action='store_true', + help='Disable internet access for all programs in pipeline.') + parser.add_argument('-D', '--docker', metavar='path', + help='Docker-compatible executable (e.g. 
docker, podman, apptainer), which may include a full path like /usr/bin/docker') + parser.add_argument('-o', '--output', metavar='path', default='output', + help='Output directory to be created, which may include a full path') + parser.add_argument('-t', '--timeout', default='24:00:00', help=argparse.SUPPRESS) + #help='Set a maximum time for pipeline to run, format is D:H:M:S, H:M:S, or M:S, or S (default: %(default)s)') + parser.add_argument('-q', '--quiet', action='store_true', + help='Quiet mode, for scripts') + parser.add_argument('--no-self-update', action='store_true', + dest='no_self_up', + help='Do not attempt to update this script') + parser.add_argument('-c', '--cpus', type=int, + help='Limit the number of CPUs available for execution by the container') + parser.add_argument('-m', '--memory', + help='Memory limit (Docker and PodMan only, ignored on Singularity); may add an optional suffix which can be one of b, k, m, or g') + parser.add_argument('--teamcity', action='store_true', help=argparse.SUPPRESS) + parser.add_argument('-d', '--debug', action='store_true', + help='Debug mode') + + args = parser.parse_args() + + # const storing the initial working directory. + # Please do not modify this variable's value. + ORIGINAL_WORKSPACE = os.getcwd() + + if ( (args.input or args.genome) and (not args.report_usage_true) and (not args.report_usage_false) ): + parser.error("One of -n/--report-usage-false or -r/--report-usage-true must be provided.") + + # Check for the different no_yaml_group arguments scenarios. 
+ if (args.genome and not args.organism) or (not args.genome and args.organism): + parser.error("Invalid Command Line Argument Error: Both arguments -s\--organism and -g\--genome must be provided if no YAML file is provided.") + elif not args.input and args.genome and args.organism: + args.genome = copy_genome_to_workspace(args.genome, ORIGINAL_WORKSPACE) + args.input = create_simple_input_yaml_file(args.genome, args.organism) + elif args.input and args.genome and args.organism: + parser.error("Invalid Command Line Argument Error: A YAML file argument cannot be used " + "in combination with either the -s/--organism or -g/--genome arguments. " + "The -s/--organism and the -g/--genome arguments replace the YAML file argument input.") + + retcode = 0 + try: + params = Setup(args) + if args.input: + if args.ani or args.ani_only: + p = Pipeline(params, args.input, "taxcheck") + retcode = p.launch() + p.cleanup() + # args.output for some reason not always available + time.sleep(1) + # analyze ani output here + print (f"DEBUG: args.output = {args.output}") + print (f"DEBUG: params.outputdir = {params.outputdir}") + outputdir = args.output # this does not work + outputdir = params.outputdir + if not os.path.exists(outputdir): + print("INTERNAL(SYSTEM)PROBLEM: abort: output directory does not exist: {}".format(outputdir)) + if args.ignore_all_errors == False: + sys.exit(1) + else: + print("Ignoring") + params.ani_output = os.path.join(outputdir, "ani-tax-report.xml") + params.ani_hr_output = os.path.join(outputdir, "ani-tax-report.txt") + if os.path.exists(params.ani_output) and os.path.getsize(params.ani_output) > 0: + True + else: + params.ani_output = None + if os.path.exists(params.ani_hr_output) and os.path.getsize(params.ani_hr_output) > 0: + True + else: + params.ani_hr_output = None + + errors_xml_fn = os.path.join(outputdir, "errors.xml") + # if there are errors + # and we do not want to recover them when it is recoverable + # then bail + if 
os.path.exists(errors_xml_fn) and os.path.getsize(errors_xml_fn) > 0 and not ( args.auto_correct_tax and params.ani_output != None ) : + error_file = None + if params.ani_hr_output != None: + error_file = params.ani_hr_output + elif params.ani_output != None: + error_file = params.ani_output + else: + error_file = errors_xml_fn + print("ERROR: taxcheck calls the genome misassigned or contaminated. See {}".format(error_file)) + if args.ignore_all_errors == False: + print("thus PGAP will not execute") + sys.exit(1) + else: + print("Ignoring") + + if not args.ani_only: + p = Pipeline(params, args.input, "pgap") + retcode = p.launch() + p.cleanup() + outputdir = p.params.outputdir + if retcode == 0: + for errors_xml_fn in glob.glob(os.path.join(outputdir, "errors.xml")): + os.remove(errors_xml_fn) + if(p.submol != None): + submol_modified = os.path.join(outputdir, p.submol) + if os.path.exists(submol_modified): + os.remove(submol_modified) + remove_empty_files(outputdir) + + except (Exception, KeyboardInterrupt) as exc: + if args.debug: + raise + retcode = 1 + import traceback + traceback.print_exc() + + sys.exit(retcode) + +if __name__== "__main__": + main()#!/usr/bin/env python3 +from __future__ import print_function +import sys + +min_python = (3,6) +try: + assert(sys.version_info >= min_python) +except: + from platform import python_version + print("Python version", python_version(), "is too old.") + print("Please use Python", ".".join(map(str,min_python)), "or later.") + sys.exit() + +import argparse +import atexit +import contextlib +import glob +import json +import multiprocessing as mp +import os +import platform +import queue +import re +import shutil +import subprocess +import tarfile +import time +import tempfile +import xml +import xml.dom.minidom + + +from io import open +from urllib.parse import urlparse, urlencode +from urllib.request import urlopen, urlretrieve, Request +from urllib.error import HTTPError + +def is_venv(): + return (hasattr(sys, 
'real_prefix') or + (hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix)) + +class urlopen_progress: + timeout = 60 + retries = 10 + + def __init__(self, url, quiet, teamcity): + self.url = url + self.bytes_so_far = 0 + self.urlopen() + + self.quiet = quiet + self.teamcity = teamcity + if teamcity: + self.EOL = '\n' + else: + self.EOL = '\r' + self.cur_row = -1 + total_size = 0 + total_size = self.remote_file.getheader('Content-Length', 0) # More modern method + + self.total_size = int(total_size) + + def urlopen(self): + headers = dict() + if self.bytes_so_far > 0: + headers['Range'] = 'bytes={}-'.format(self.bytes_so_far) + request = Request(self.url, headers=headers) + self.remote_file = urlopen(request, timeout=self.timeout) + + def read(self, n=8388608): + delay = 1 + for attempt in range(self.retries): + try: + if self.remote_file is None: + self.urlopen() + buffer = self.remote_file.read(n) + if not buffer: + if not self.quiet: + sys.stdout.write('\n') + return '' + break + except Exception as ex: + self.remote_file = None + time.sleep(delay) + delay += delay + + self.bytes_so_far += len(buffer) + percent = float(self.bytes_so_far) / self.total_size + percent = round(percent*100, 2) + + do_print = True + if self.teamcity: + do_print = False + row = int(percent) + if row > self.cur_row: + self.cur_row = row + do_print = True + + if do_print and not self.quiet: + sys.stderr.write("Downloaded %d of %d bytes (%0.2f%%)%s" % (self.bytes_so_far, self.total_size, percent, self.EOL)) + + return buffer + +def install_url(url, path, quiet, teamcity, guard_file): + basename = os.path.basename(urlparse(url).path) + try: + local_file = os.path.join(path, basename) + if os.path.exists(local_file): + if not quiet: + print('Extracting local tarball: {}'.format(local_file)) + fileobj = open(local_file, 'rb') + else: + if not quiet: + print('Downloading and extracting tarball: {}'.format(url)) + fileobj = urlopen_progress(url, quiet, teamcity) + with 
tarfile.open(mode='r|*', fileobj=fileobj) as tar: + tar.extractall(path=path) + except: + sys.stderr.write(''' +ERROR: Failed to extract tarball; to install manually, try something like: + curl -OLC - {} + tar xvf {} +'''.format(url, basename)) + raise + if guard_file != None: + open(guard_file, 'a').close() + +def quiet_remove(filename): + with contextlib.suppress(FileNotFoundError): + os.remove(filename) + +def find_failed_step(filename): + r = "^\[(?P