Merge remote-tracking branch 'origin/test' into prod

ncbi · Oct 4, 2023 · 592124f · 592124f
2 parents 2388d68 + 2d85168
commit 592124f
Show file tree

Hide file tree

Showing 11 changed files with 164 additions and 45 deletions.
diff --git a/README.md b/README.md
@@ -15,7 +15,13 @@ in 2001 and is regularly upgraded to improve structural and functional
 annotation quality ([Li W, O'Neill KR et al 2021](https://www.ncbi.nlm.nih.gov/pubmed/33270901)). Recent 
 improvements include utilization of curated protein profile hidden Markov models (HMMs), 
 and curated complex domain architectures for functional annotation of proteins and 
-annotation of Enzyme Commission numbers and Gene Ontology terms.
+annotation of Enzyme Commission numbers and Gene Ontology terms. Post-annotation, the 
+completeness of the annotated gene set is estimated with 
+[CheckM](https://pubmed.ncbi.nlm.nih.gov/25977477/).
+
+The workflow provided here also offers the option to confirm or correct the organism
+associated with the genome assembly prior to starting the annotation, using the 
+[Average Nucleotide Identity tool](https://pubmed.ncbi.nlm.nih.gov/29792589/).
 
 Get started by watching this [webinar](https://www.youtube.com/watch?v=pNn_-_46lpI)!
 
@@ -56,6 +62,12 @@ Int J Syst Evol Microbiol. 2018 Jul;68(7):2386-2392.
 Lomsadze A, Gemayel K, Tang S, Borodovsky M.\
 Genome Research. 2018; 28(7):1079-1089.
 
+### CheckM
+[CheckM: assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes](https://pubmed.ncbi.nlm.nih.gov/25977477/)\
+Parks DH, Imelfort M, Skennerton CT, Hugenholtz P, Tyson GW.\
+Genome Research. 2015; 25(7):1043-1055.
+
+
 ### TIGRFAMs
 
 [TIGRFAMs: a protein family resource for the functional identification of proteins.](https://www.ncbi.nlm.nih.gov/pubmed/11125044)\
@@ -107,6 +119,12 @@ GeneMarkS-2+ is distributed as part of PGAP with limited rights of use
 and redistribution from the Georgia Tech Research Corporation. See the
 [full text of the license](GeneMarkS_Software_License.txt).
 
+### CheckM
+
+GNU General Public License v3.0
+
+Permissions of this strong copyleft license are conditioned on making available complete source code of licensed works and modifications, which include larger works using a licensed work, under the same license. Copyright and license notices must be preserved. Contributors provide an express grant of patent rights. See the [full text of the license](Check-M-license.txt).
+
 ### TIGRFAMs
 
 The original TIGRFAMs database was a research project of the J. Craig

diff --git a/pgap.cwl b/pgap.cwl
@@ -47,7 +47,7 @@ outputs:
     outputSource: standard_pgap/gff
     type: File
   gff_enhanced:
-    outputSource: standard_pgap/gff_enhanced
+    outputSource: Generate_Annotation_Reports_gff_enhanced/output
     type: File
   sqn:
     outputSource: standard_pgap/sqn
@@ -184,5 +184,13 @@ steps:
       no_internet: no_internet
       make_uuid: make_uuid
       uuid_in: uuid_in
-    out: [gbent, gbk, gff, gff_enhanced, nucleotide_fasta, protein_fasta, cds_nucleotide_fasta, cds_protein_fasta, sqn, initial_asndisc_error_diag, initial_asnval_error_diag, final_asndisc_error_diag, final_asnval_error_diag, checkm_raw, checkm_results]
+      blast_hits_cache_data: blast_hits_cache_data
+    out: [gbent, gbk, gff, nucleotide_fasta, protein_fasta, cds_nucleotide_fasta, cds_protein_fasta, sqn, initial_asndisc_error_diag, initial_asnval_error_diag, final_asndisc_error_diag, final_asnval_error_diag, checkm_raw, checkm_results]
     run: wf_common.cwl
+  Generate_Annotation_Reports_gff_enhanced:
+    run: progs/produce_enhanced_gff.cwl
+    in:
+        gff: standard_pgap/gff
+        fasta: fasta
+    out: [output]
+
diff --git a/progs/bact_best_evidence_alignments.cwl b/progs/bact_best_evidence_alignments.cwl
@@ -12,7 +12,7 @@ requirements:
         entry: ${var blob = '# annotation.mft created for bact_best_evidence_alignments from input annotation Array of Files\n'; for (var i = 0; i < inputs.annotation.length; i++) { blob += inputs.annotation[i].path + '\n'; } return blob; }
 
 baseCommand: bact_best_evidence_alignments
-arguments: [-support-threshold, "25.0",-weak-threshold, "20"]
+arguments: [-support-threshold, "25.0",-weak-threshold, "17"]
 inputs:
   annotation:
     type: File[]

diff --git a/progs/kmer_top_identification.cwl b/progs/kmer_top_identification.cwl
@@ -40,6 +40,14 @@ inputs:
     type: float
     inputBinding:
       prefix: -threshold
+  minN:
+    type: int
+    inputBinding:
+      prefix: -minN
+  strict:
+    type: boolean?
+    inputBinding:
+      prefix: -strict
 outputs:
     matches:
         type: File

diff --git a/progs/produce_enhanced_gff.cwl b/progs/produce_enhanced_gff.cwl
@@ -17,7 +17,7 @@ inputs:
       position: 1
   separator:
     type: string?
-    default: '### FASTA'
+    default: '## FASTA'
   separator_file:
     type: string?
     default: 'separator.txt'

diff --git a/scripts/pgap.py b/scripts/pgap.py
@@ -258,8 +258,8 @@ def make_podman_cmd(self):
             log_dir = self.params.outputdir + '/debug/log'
             os.makedirs(log_dir, exist_ok=True)
             self.cmd.extend(['--volume', '{}:/log/srv'.format(log_dir)])
-            if self.params.args.container_name:
-                self.cmd.extend(['--name', self.params.args.container_name])
+        if self.params.args.container_name:
+            self.cmd.extend(['--name', self.params.args.container_name])
         self.cmd.append(self.params.docker_image)
 
     def make_singularity_cmd(self):
@@ -506,7 +506,7 @@ def launch(self):
                         {"file": "initial_asndisc_diag.xml", "remove": True},
                         {"file": "initial_asnval_diag.xml", "remove": True}
                     ]
-                self.report_output_files(self.params.args.output, output_files)
+                self.report_output_files(self.params.outputdir, output_files)
         return proc.returncode
 
 class Setup:
@@ -642,15 +642,9 @@ def get_use_version(self):
     def get_output_dir(self):
         outputdir = os.path.abspath(self.args.output)
         if os.path.exists(outputdir):
-            parent, base = os.path.split(outputdir)
-            counter = 0
-            for sibling in os.listdir(parent):
-                if sibling.startswith(base + '.'):
-                    ext = sibling[len(base)+1:]
-                    if ext.isdecimal():
-                       counter = max(counter, int(ext))
-            outputdir = os.path.join(parent, base+'.'+str(counter+1))
-        return outputdir
+            sys.exit(f"Output directory {outputdir} exists, exiting.")
+        else:
+            return outputdir
 
     def get_docker_info(self):
         docker_type_alternatives = ['docker', 'podman', 'singularity', 'apptainer']
@@ -853,6 +847,29 @@ def remove_empty_files(rootdir):
         if os.path.isfile(fullname) and os.path.getsize(fullname) == 0:
             quiet_remove(fullname)
 
+def copy_genome_to_workspace(genome, original_workspace):
+    """Copy genome into original_workspace unless a same-named file is there; return its basename (exits 1 if missing)."""
+    # Fail early with a clear message when the input path does not exist.
+    if not os.path.exists(genome):
+        print(f"Error: The input genome file:{genome} does not exist.")
+        sys.exit(1)  # Exit the script with an error code
+
+    filename = os.path.basename(genome)
+    new_genome_path = os.path.join(original_workspace, filename)
+
+    # A same-named file already present in the workspace is reused, not overwritten.
+    if os.path.exists(new_genome_path):
+        return filename
+
+    try:
+        # The source may still disappear between the existence check above and the copy.
+        shutil.copy2(genome, new_genome_path)
+    except FileNotFoundError:
+        print(f"Error: The genome file {genome} does not exist.")
+        sys.exit(1)  # Exit the script with an error code
+
+    return filename
+
 def create_simple_input_yaml_file(fasta_location, genus_species, output_filename='input.yaml'):
     # Note: The args are not validated here, as they are validated when the generated YAML files are ingested in the pipeline.
     submol_content = f'''\
@@ -874,7 +891,7 @@ def create_simple_input_yaml_file(fasta_location, genus_species, output_filename
         f.write(yaml_content)
 
     return os.path.abspath(output_filename)
-       
+
 def main():
 
     parser = argparse.ArgumentParser(description="Input must be provided as:\n"
@@ -895,11 +912,10 @@ def main():
                         help='Print currently set up PGAP version')
     parser.add_argument('-v', '--verbose', action='store_true',
                         help='Verbose mode')
-
-    version_group = parser.add_mutually_exclusive_group()
-    version_group.add_argument('--dev',  action='store_true', help=argparse.SUPPRESS) # help="Set development mode")
-    version_group.add_argument('--test', action='store_true', help=argparse.SUPPRESS) # help="Set test mode")
-    version_group.add_argument('--prod', action='store_true', help="Use a production candidate version. For internal testing.")
+
+    parser.add_argument('--dev',  action='store_true', help=argparse.SUPPRESS) # help="Set development mode")
+    parser.add_argument('--test', action='store_true', help=argparse.SUPPRESS) # help="Set test mode")
+    parser.add_argument('--prod', action='store_true', help=argparse.SUPPRESS) # help="Use a production candidate version. For internal testing."
 
     ani_group = parser.add_mutually_exclusive_group()
     ani_group.add_argument('--taxcheck', dest='ani',  action='store_true', help="Also calculate the Average Nucleotide Identity")
@@ -955,10 +971,18 @@ def main():
 
     args = parser.parse_args()
 
+    # const storing the initial working directory.
+    # Please do not modify this variable's value.
+    ORIGINAL_WORKSPACE = os.getcwd()
+
+    if ( (args.input or args.genome) and (not args.report_usage_true) and (not args.report_usage_false) ):
+        parser.error("One of -n/--report-usage-false or -r/--report-usage-true must be provided.")
+
     # Check for the different no_yaml_group arguments scenarios.
     if (args.genome and not args.organism) or (not args.genome and args.organism):
         parser.error("Invalid Command Line Argument Error: Both arguments -s\--organism and -g\--genome must be provided if no YAML file is provided.")
     elif not args.input and args.genome and args.organism:
+        args.genome = copy_genome_to_workspace(args.genome, ORIGINAL_WORKSPACE)
         args.input = create_simple_input_yaml_file(args.genome, args.organism)
     elif args.input and args.genome and args.organism:
         parser.error("Invalid Command Line Argument Error: A YAML file argument cannot be used "
@@ -976,14 +1000,18 @@ def main():
                 # args.output for some reason not always available 
                 time.sleep(1) 
                 # analyze ani output here
-                if not os.path.exists(args.output):
-                    print("INTERNAL(SYSTEM)PROBLEM: abort: output directory does not exist: {}".format(args.output))
+                print (f"DEBUG: args.output = {args.output}")
+                print (f"DEBUG: params.outputdir = {params.outputdir}")
+                outputdir = args.output # this does not work
+                outputdir = params.outputdir
+                if not os.path.exists(outputdir):
+                    print("INTERNAL(SYSTEM)PROBLEM: abort: output directory does not exist: {}".format(outputdir))
                     if  args.ignore_all_errors == False:
                         sys.exit(1)
                     else:
                         print("Ignoring")
-                params.ani_output = os.path.join(args.output, "ani-tax-report.xml")
-                params.ani_hr_output = os.path.join(args.output, "ani-tax-report.txt")
+                params.ani_output = os.path.join(outputdir, "ani-tax-report.xml")
+                params.ani_hr_output = os.path.join(outputdir, "ani-tax-report.txt")
                 if os.path.exists(params.ani_output) and os.path.getsize(params.ani_output) > 0:
                     True
                 else:
@@ -993,7 +1021,7 @@ def main():
                 else:
                     params.ani_hr_output = None 
 
-                errors_xml_fn = os.path.join(args.output, "errors.xml")
+                errors_xml_fn = os.path.join(outputdir, "errors.xml")
                 # if there are errors
                 # and we do not want to recover them when it is recoverable
                 #  then bail
@@ -1011,20 +1039,20 @@ def main():
                         sys.exit(1)
                     else:
                         print("Ignoring")
-                    remove_empty_files(args.output)
 
             if not args.ani_only:
                 p = Pipeline(params, args.input, "pgap")
                 retcode = p.launch()
                 p.cleanup()
+                outputdir = p.params.outputdir 
                 if retcode == 0:
-                    for errors_xml_fn in glob.glob(os.path.join(args.output, "errors.xml")):
+                    for errors_xml_fn in glob.glob(os.path.join(outputdir, "errors.xml")):
                         os.remove(errors_xml_fn)
                     if(p.submol != None):
-                        submol_modified = os.path.join(args.output, p.submol)
+                        submol_modified = os.path.join(outputdir, p.submol)
                         if os.path.exists(submol_modified):
                             os.remove(submol_modified)
-                remove_empty_files(args.output)
+            remove_empty_files(outputdir)
 
     except (Exception, KeyboardInterrupt) as exc:
         if args.debug:

diff --git a/scripts/stdvalsum2single-genome-validation.xsl b/scripts/stdvalsum2single-genome-validation.xsl
@@ -0,0 +1,59 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+        
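+        asndisc/asnvalidate summary -> proc_annot_stats
+        NOTE(review): the template in this file emits an HTML table of diagnostic counts, not proc_annot_stats XML; confirm this description is still accurate.
+        proc_annot_stats format of XML is a format that serves as a standard input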
+        for subsequent loading to ProkRefseqTracking..ProcessAnnotationStats table
+        the latter XML format is used as a template for classes under valres C++ namespace
+        top element corresponding to ncbi::objects::valres::CValidationResults
+        
+    -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:xs="http://www.w3.org/2001/XMLSchema"
+    exclude-result-prefixes="xs"
+    version="2.0">
+    <xsl:output method="xml" indent="yes" encoding="UTF-8"/>
+    <xsl:variable name="smallcase" select="'abcdefghijklmnopqrstuvwxyz'" />
+    <xsl:variable name="uppercase" select="'ABCDEFGHIJKLMNOPQRSTUVWXYZ'" />
+
+    <xsl:template match="ValidationErrors|DiscrepancyCounts">
+        <html>
+            <body>
+                <table border="1">
+                    <xsl:variable name="caption">
+                        <xsl:if test="name(.) = 'ValidationErrors'">STATISTICS OF ASNVALIDATE DIAGNOSTICS</xsl:if>
+                        <xsl:if test="name(.) = 'DiscrepancyCounts'">STATISTICS OF ASNDISC DIAGNOSTICS</xsl:if>
+                    </xsl:variable>
+                    <caption>
+                        <xsl:value-of select="$caption"/>
+                    </caption>
+                    <tr>
+                        <th>Application/level</th>
+                        <th>Level/code</th>
+                        <th>Count</th>
+                    </tr>
+            <xsl:variable name="application">
+                <xsl:if test="name(.) = 'ValidationErrors'">asnval</xsl:if>
+                <xsl:if test="name(.) = 'DiscrepancyCounts'">asndisc</xsl:if>
+            </xsl:variable>
+            <xsl:for-each select="Counts/Total">
+                <tr>
+                    <td><xsl:value-of select="$application"/></td>
+                    <td><xsl:value-of select='translate(../@Severity, $uppercase , $smallcase)'/></td>
+                    <td><xsl:value-of select="text()"/></td>
+                </tr>
+            </xsl:for-each>
+            <xsl:for-each select="Counts/Subcount">
+                <tr>
+                    <td><xsl:value-of select="$application"/><xsl:text> </xsl:text><xsl:value-of 
+                        select='translate(../@Severity, $uppercase , $smallcase)'/></td>
+                    <td><xsl:value-of select="@Code"/></td>
+                    <td><xsl:value-of select="text()"/></td>
+                </tr>
+            </xsl:for-each>
+                </table>
+            </body>
+        </html>
+    </xsl:template>
+</xsl:stylesheet>
diff --git a/task_types/tt_kmer_compare_wnode.cwl b/task_types/tt_kmer_compare_wnode.cwl
@@ -36,7 +36,7 @@ steps:
             num_partitions: 
                 default: 1
             output: 
-                default: "distances.##.gz"
+                default: "distances.##"
             output_glob:
                 default: "distances.*.gz"
             unzip:

diff --git a/task_types/tt_kmer_ref_compare_wnode.cwl b/task_types/tt_kmer_ref_compare_wnode.cwl
@@ -43,7 +43,7 @@ steps:
             num_partitions: 
                 default: 1
             output: 
-                default: "distances.##.gz"
+                default: "distances.##"
             output_glob:
                 default: "distances.*.gz"
             unzip:

diff --git a/task_types/tt_kmer_top_n.cwl b/task_types/tt_kmer_top_n.cwl
@@ -23,8 +23,12 @@ steps:
         in:
             kmer_cache_sqlite: kmer_cache_sqlite
             N:    
-                default: 20
+                default: 40
             distances: distances
             threshold: 
-                default: 0.8
+                default: 0.995
+            minN:
+                default: 5
+            strict:
+                default: true
         out: [top_distances, matches]
diff --git a/wf_common.cwl b/wf_common.cwl
@@ -748,12 +748,6 @@ steps:
         nuc_fasta_name:
             default: annot.fna
     out: [nuc_fasta]
-  Generate_Annotation_Reports_gff_enhanced:
-    run: progs/produce_enhanced_gff.cwl
-    in:
-        gff: Generate_Annotation_Reports_gff/output
-        fasta: Generate_Annotation_Reports_nuc_fasta/nuc_fasta
-    out: [output]
   Generate_Annotation_Reports_prot_fasta:
     run: progs/asn2fasta.cwl
     in:
@@ -1026,9 +1020,6 @@ outputs:
   gff:
     type: File
     outputSource:  Generate_Annotation_Reports_gff/output
-  gff_enhanced:
-    type: File
-    outputSource:  Generate_Annotation_Reports_gff_enhanced/output
   gbk:
     type: File
     outputSource:  Generate_Annotation_Reports_gbk/output
@@ -1050,6 +1041,9 @@ outputs:
   proc_annot_stats: 
     type: File
     outputSource:  Validate_Annotation_proc_annot_stats/var_proc_annot_stats_xml
+  all_proc_annot_stats: 
+    type: File
+    outputSource:  Validate_Annotation_collect_annot_stats/output
   initial_asndisc_error_diag:
     type: File?
     outputSource:  Prepare_Unannotated_Sequences_asndisc_evaluate/xml_output