merenlab · ivagljiva · Apr 24, 2020 · Feb 28, 2020 · Feb 28, 2020 · Feb 28, 2020
diff --git a/.gitignore b/.gitignore
@@ -20,3 +20,5 @@ diamond-log-file.txt
 anvio/data/misc/SCG_TAXONOMY/GTDB/SCG_SEARCH_DATABASES/*.dmnd
 
 anvio/tests/sandbox/test_visualize_split_coverages/TEST_OUTDIR
+anvio/data/misc/KEGG/
+anvio/data/misc/Pfam/
diff --git a/anvio/__init__.py b/anvio/__init__.py
@@ -689,6 +689,14 @@ def get_args(parser):
                      "program with superuser privileges. If you don't have superuser privileges, then you can "
                      "use this parameter to tell anvi'o the location you wish to use to setup your database."}
                 ),
+    'kegg-data-dir': (
+            ['--kegg-data-dir'],
+            {'default': None,
+             'type': str,
+             'help': "The directory path for your KEGG setup, which will include things like \
+                      KOfam profiles and KEGG MODULE data. Anvi'o will try to use the default path\
+                      if you do not specify anything."}
+                ),
     'hide-outlier-SNVs': (
             ['--hide-outlier-SNVs'],
             {'default': False,
@@ -701,6 +709,12 @@ def get_args(parser):
                      "up using this flag) (plus, there may or may not be some historical data on this here: "
                      "https://github.com/meren/anvio/issues/309)."}
                 ),
+    'hmmer-program': (
+            ['--hmmer-program'],
+            {'type': str,
+            'required': False,
+             'help': "Which of the HMMER programs to use to run HMMs (ie, hmmscan, hmmsearch)"}
+                ),
     'hmm-source': (
             ['--hmm-source'],
             {'metavar': 'SOURCE NAME',
@@ -2265,6 +2279,41 @@ def get_args(parser):
              'help': "Provide if working with INSeq/Tn-Seq genomic data. With this, all gene level "
                      "coverage stats will be calculated using INSeq/Tn-Seq statistical methods."}
                 ),
+    'module-completion-threshold': (
+            ['--module-completion-threshold'],
+            {'default': 0.75,
+             'metavar': 'NUM',
+             'type': float,
+             'help': "This threshold defines the point at which we consider a KEGG module to be 'complete' or "
+                     "'present' in a given genome or bin. It is the fraction of steps that must be complete in "
+                     "order for the entire module to be marked complete. The default is %(default)g."}
+                ),
+    'get-raw-data-as-json': (
+            ['--get-raw-data-as-json'],
+            {'default': None,
+            'metavar': 'FILENAME_PREFIX',
+            'type': str,
+            'help': "If you want the raw metabolism estimation data dictionary in JSON-format, provide a filename prefix to this argument. "
+                    "The program will then output a file with the .json extension containing this data."}
+                ),
+    'store-json-without-estimation': (
+            ['--store-json-without-estimation'],
+            {'default': False,
+            'action': 'store_true',
+            'help': "This flag is used to control what is stored in the JSON-formatted metabolism data dictionary. When this flag is provided alongside the "
+                    "--get-raw-data-as-json flag, the JSON file will be created without running metabolism estimation, and "
+                    "that file will consequently include only information about KOfam hits and gene calls. The idea is that you can "
+                    "then modify this file as you like and re-run this program using the flag --estimate-from-json."}
+                ),
+    'estimate-from-json': (
+            ['--estimate-from-json'],
+            {'default': None,
+            'metavar': 'FILE_PATH',
+            'type': str,
+            'help': "If you have a JSON file containing KOfam hits and gene call information from your contigs database "
+                    "(such as a file produced using the --get-raw-data-as-json flag), you can provide that file to this flag "
+                    "and KEGG metabolism estimates will be computed from the information within instead of from a contigs database."}
+                ),
 }
 
 # two functions that works with the dictionary above.
@@ -2300,7 +2349,8 @@ def set_version():
            t.genes_db_version, \
            t.auxiliary_data_version, \
            t.genomes_storage_vesion, \
-           t.structure_db_version
+           t.structure_db_version, \
+           t.kegg_modules_db_version
 
 
 def get_version_tuples():
@@ -2311,7 +2361,8 @@ def get_version_tuples():
             ("Auxiliary data storage version", __auxiliary_data_version__),
             ("Pan DB version", __pan__version__),
             ("Genome data storage version", __genomes_storage_version__),
-            ("Structure DB version", __structure__version__)]
+            ("Structure DB version", __structure__version__),
+            ("Kegg Modules DB version", __kegg_modules_version__)]
 
 
 def print_version():
@@ -2322,6 +2373,7 @@ def print_version():
     run.info("Genome data storage version", __genomes_storage_version__)
     run.info("Auxiliary data storage version", __auxiliary_data_version__)
     run.info("Structure DB version", __structure__version__)
+    run.info("Kegg Modules DB version", __kegg_modules_version__)
 
 
 __version__, \
@@ -2332,7 +2384,8 @@ def print_version():
 __genes__version__, \
 __auxiliary_data_version__, \
 __genomes_storage_version__ , \
-__structure__version__ = set_version()
+__structure__version__, \
+__kegg_modules_version__ = set_version()
 
 
 if '-v' in sys.argv or '--version' in sys.argv:

diff --git a/anvio/constants.py b/anvio/constants.py
@@ -62,7 +62,7 @@
 
 max_num_items_for_hierarchical_clustering = 20000
 
-# max coverage depth to read from BAM files using pysam. 
+# max coverage depth to read from BAM files using pysam.
 # this parameter also can be set later using command line parameters
 # we use uint16 as dtype for numpy arrays when we work on & store coverages
 # which has limit of 65536, so this constant needs to be smaller than that.
@@ -163,8 +163,8 @@
                                  'Val': {"C":5,  "H":11, "N":1, "O":2, "S":0}})
 
 # taken from http://prowl.rockefeller.edu/aainfo/volume.htm
-# volume reference: A.A. Zamyatin, Protein Volume in Solution, Prog. Biophys. Mol. Biol. 24(1972)107-123. 
-# surface area reference: C. Chotia, The Nature of the Accessible and Buried Surfaces in Proteins, J. Mol. Biol., 105(1975)1-14. 
+# volume reference: A.A. Zamyatin, Protein Volume in Solution, Prog. Biophys. Mol. Biol. 24(1972)107-123.
+# surface area reference: C. Chotia, The Nature of the Accessible and Buried Surfaces in Proteins, J. Mol. Biol., 105(1975)1-14.
 AA_geometry = Counter({'Ala': {"volume":88.6,  "area":115},
                        'Arg': {"volume":173.4, "area":225},
                        'Asn': {"volume":111.1, "area":150},
@@ -369,3 +369,7 @@ def get_codon_to_num_lookup(reverse_complement=False):
 nt_to_RC_num_lookup = get_nt_to_num_lookup({'A': 3, 'C': 2, 'G': 1, 'T': 0, 'N': 4})
 codon_to_num_lookup = get_codon_to_num_lookup(reverse_complement=False)
 codon_to_RC_num_lookup = get_codon_to_num_lookup(reverse_complement=True)
+
+
+# KEGG setup constant - used to warn user that the KEGG MODULES.db data may need to be updated
+KEGG_SETUP_INTERVAL = 90 # days since last MODULES.db creation
diff --git a/anvio/db.py b/anvio/db.py
@@ -450,7 +450,7 @@ def get_table_as_dict(self, table_name, table_structure=None, string_the_key=Fal
         # entry assigns a new `entry_id`, enters the data. it is all good when there is a single process doing it.
         # but when there are multiple processes running in parallel, sometimes race conditions occur: two processes
         # learn the max entry id about the same time, and when they finally enter the data to the db, some entries
-        # end up not being unique. this is a toughie because sometimes entry ids are used to connect distinct 
+        # end up not being unique. this is a toughie because sometimes entry ids are used to connect distinct
         # information from different tables, so they must be known before the data goes into the database, etc.
         # when these race conditions occur, anvi'o gives an error telling the user kindly that they are fucked. but in
         # some cases it is possible to recover from that (THE CODE BELOW TRIES TO DO THAT) by reassigning all ids on the
@@ -593,13 +593,17 @@ def get_table_as_dataframe(self, table_name, where_clause=None, columns_of_inter
         return results_df[columns_of_interest]
 
 
-    def get_some_rows_from_table_as_dict(self, table_name, where_clause, error_if_no_data=True, string_the_key=False):
+    def get_some_rows_from_table_as_dict(self, table_name, where_clause, error_if_no_data=True, string_the_key=False, row_num_as_key=False):
         """This is similar to get_table_as_dict, but much less general.
 
            get_table_as_dict can do a lot, but it first reads all data into the memory to operate on it.
            In some cases the programmer may like to access to only a small fraction of entries in a table
            by using `WHERE column = value` notation, which is not possible with the more generalized
-           function."""
+           function.
+
+           row_num_as_key   bool    added as parameter so this function works for KEGG MODULES.db, which does not have unique IDs in the
+                                    first column. If True, the returned dictionary will be keyed by integers from 0 to (# rows returned - 1)
+        """
 
         results_dict = {}
 
@@ -608,16 +612,29 @@ def get_some_rows_from_table_as_dict(self, table_name, where_clause, error_if_no
 
         rows = self._exec('''SELECT * FROM %s WHERE %s''' % (table_name, where_clause)).fetchall()
 
+        row_num = 0
         for row in rows:
             entry = {}
 
-            for i in columns_to_return[1:]:
-                entry[table_structure[i]] = row[i]
+            if row_num_as_key:
+                entry[table_structure[0]] = row[0]
+                for i in columns_to_return[1:]:
+                    entry[table_structure[i]] = row[i]
 
-            if string_the_key:
-                results_dict[str(row[0])] = entry
+                if string_the_key:
+                    results_dict[str(row_num)] = entry
+                else:
+                    results_dict[row_num] = entry
             else:
-                results_dict[row[0]] = entry
+                for i in columns_to_return[1:]:
+                    entry[table_structure[i]] = row[i]
+
+                if string_the_key:
+                    results_dict[str(row[0])] = entry
+                else:
+                    results_dict[row[0]] = entry
+
+            row_num += 1
 
         if error_if_no_data and not len(results_dict):
             raise ConfigError("Query on %s with the where clause of '%s' did not return anything." % (table_name, where_clause))

diff --git a/anvio/dbops.py b/anvio/dbops.py
@@ -502,7 +502,7 @@ def init_functions(self, requested_sources=[], dont_panic=False):
             if gene_callers_id not in self.gene_function_calls_dict:
                 self.gene_function_calls_dict[gene_callers_id] = dict([(s, None) for s in self.gene_function_call_sources])
 
-            if self.gene_function_calls_dict[gene_callers_id][source]:
+            if self.gene_function_calls_dict[gene_callers_id][source] and e_value:
                 if self.gene_function_calls_dict[gene_callers_id][source][2] < e_value:
                     # 'what we have:', self.gene_function_calls_dict[gene_callers_id][source]
                     # 'rejected    :', ('%s :: %s' % (function if function else 'unknown', accession), e_value)
@@ -3959,7 +3959,7 @@ def create(self, args):
     def compress_nt_position_info(self, contig_length, genes_in_contig, genes_in_contigs_dict):
         """This function compresses information regarding each nucleotide position in a given contig
            into a small int. Every nucleotide position is represented by four bits depending on whether
-           they occur in a complete opoen reading frame, and which base they correspond to in a codon.
+           they occur in a complete open reading frame, and which base they correspond to in a codon.
 
                 0000
                 ||||

diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py
@@ -6,6 +6,7 @@
 import gzip
 import shutil
 from threading import Thread, Lock
+import glob
 
 import anvio
 import anvio.utils as utils
@@ -30,26 +31,54 @@
 
 
 class HMMer:
-    def __init__(self, target_files_dict, num_threads_to_use=1, progress=progress, run=run):
+    def __init__(self, target_files_dict, num_threads_to_use=1, program_to_use="hmmscan", progress=progress, run=run):
         """A class to streamline HMM runs."""
         self.num_threads_to_use = num_threads_to_use
+        self.program_to_use = program_to_use
         self.progress = progress
         self.run = run
 
         self.tmp_dirs = []
         self.target_files_dict = {}
 
+        acceptable_programs = ["hmmscan", "hmmsearch"]
+        if self.program_to_use not in acceptable_programs:
+            raise ConfigError("HMMer class here. You are attempting to use the program %s to run HMMs, but we don't recognize it. The currently"
+                                " supported programs are: %s" % (self.program_to_use, ", ".join(acceptable_programs)))
+
         for source in target_files_dict:
             tmp_dir = filesnpaths.get_temp_directory_path()
             self.tmp_dirs.append(tmp_dir)
 
             part_file_name = os.path.join(tmp_dir, os.path.basename(target_files_dict[source]))
 
             # create splitted fasta files inside tmp directory
-            self.target_files_dict[source] = utils.split_fasta(target_files_dict[source], 
+            self.target_files_dict[source] = utils.split_fasta(target_files_dict[source],
                                                                parts=self.num_threads_to_use,
                                                                prefix=part_file_name)
 
+    def verify_hmmpress_output(self, hmm_path):
+        """Verify that every .hmm profile in the directory `hmm_path` has been hmmpressed.
+
+        hmmpress appends its extensions to the full name of the profile file it is given, so the
+        profile `X.hmm` must be accompanied by `X.hmm.h3f`, `X.hmm.h3i`, `X.hmm.h3m`, and `X.hmm.h3p`.
+
+        PARAMETERS
+        ==========
+        hmm_path    string, the directory in which the HMM profiles are located
+
+        Raises a ConfigError naming the first missing file. If no '*.hmm' file is found, nothing is checked.
+        """
+        expected_extensions = ['h3f', 'h3i', 'h3m', 'h3p']
+        for file_path in glob.glob(os.path.join(hmm_path, '*.hmm')):
+            for ext in expected_extensions:
+                pressed_file_path = file_path + '.' + ext
+                if not os.path.exists(pressed_file_path):
+                    raise ConfigError("It appears that hmmpress was not properly run on the hmm profiles at %s. The \
+                                        file %s does not exist. It is likely that you will have to set up your profiles \
+                                        again by running a program such as `anvi-setup-pfams` or `anvi-setup-kegg-kofams`. \
+                                        We are very sorry about this." % (hmm_path, pressed_file_path))
+
 
     def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms):
         target = ':'.join([alphabet, context])
@@ -70,40 +99,24 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode
         self.run.info('Context', context)
         self.run.info('Domain', domain if domain else 'N\\A')
         self.run.info('HMM model path', hmm)
-        self.run.info('Number of genes', num_genes_in_model)
+        self.run.info('Number of genes in HMM model', num_genes_in_model)
         self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
         self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)
+        if alphabet in ['DNA', 'RNA']:
+            self.run.info('HMMer program used for search', 'nhmmscan')
+        else:
+            self.run.info('HMMer program used for search', self.program_to_use)
 
-        # we want to create hmm files in the same direcotry
         tmp_dir = os.path.dirname(self.target_files_dict[target][0])
-        log_file_path = os.path.join(tmp_dir, '00_log.txt')
+        log_file_path = os.path.join(tmp_dir, '*_log')
 
         self.run.info('Temporary work dir', tmp_dir)
-        self.run.info('Log file', log_file_path)
-
-        self.progress.new('Unpacking the model into temporary work directory')
-        self.progress.update('...')
-        hmm_file_path = os.path.join(tmp_dir, source + '_hmm.txt')
-        hmm_file = open(hmm_file_path, 'wb')
-        hmm_file.write(gzip.open(hmm, 'rb').read())
-        hmm_file.close()
-        self.progress.end()
+        self.run.info('Log files', log_file_path)
 
-        self.progress.new('Processing')
-        self.progress.update('Compressing the pfam model')
-
-        cmd_line = ['hmmpress', hmm_file_path]
-        ret_val = utils.run_command(cmd_line, log_file_path)
-
-        if ret_val:
-            raise ConfigError("The last call did not work quite well. Most probably the version of HMMER you have "
-                              "installed is either not up-to-date enough, or too new :/ Just to make sure what went "
-                              "wrong please take a look at the log file ('%s'). Please visit %s to see what "
-                              "is the latest version availalbe if you think updating HMMER can resolve it. You can "
-                              "learn which version of HMMER you have on your system by typing 'hmmpress -h'."\
-                                       % (log_file_path, 'http://hmmer.janelia.org/download.html'))
-        self.progress.end()
 
+        # check if all hmmpress files are in the HMM directory
+        self.verify_hmmpress_output(hmm)
+        # we may want to throw a more descriptive error *here* instead of failing in the verify function
 
 
         workers = []
@@ -119,17 +132,28 @@ def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_mode
                             "Anvi'o will use %s process with %s cores each instead. I hope thats okay for you. " %
                              (str(self.num_threads_to_use), str(num_parts), target, str(num_parts), cores_per_process))
 
+        if alphabet in ['DNA', 'RNA'] and self.program_to_use == 'hmmsearch':
+            self.run.warning("You requested to use the program `%s`, but because you are working with %s sequences Anvi'o will use `nhmmscan` instead. "
+                            "We hope that is alright." % (self.program_to_use, alphabet))
+
 
         for part_file in self.target_files_dict[target]:
             log_file = part_file + '_log'
             output_file = part_file + '_output'
             shitty_file = part_file + '_shitty'
 
-            cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan',
-                        '-o', output_file, *noise_cutoff_terms.split(),
-                        '--cpu', cores_per_process,
-                        '--tblout', shitty_file,
-                        hmm_file_path, part_file]
+            if noise_cutoff_terms:
+                cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else self.program_to_use,
+                            '-o', output_file, *noise_cutoff_terms.split(),
+                            '--cpu', cores_per_process,
+                            '--tblout', shitty_file,
+                            hmm, part_file]
+            else: # if we didn't pass any noise cutoff terms, here we don't include them in the command line
+                cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else self.program_to_use,
+                            '-o', output_file,
+                            '--cpu', cores_per_process,
+                            '--tblout', shitty_file,
+                            hmm, part_file]
 
             t = Thread(target=self.hmmscan_worker, args=(part_file,
                                                          cmd_line,
@@ -199,5 +223,3 @@ def hmmscan_worker(self, part_file, cmd_line, shitty_output_file, log_file, merg
     def clean_tmp_dirs(self):
         for tmp_dir in self.tmp_dirs:
             shutil.rmtree(tmp_dir)
-
-