From bee13cc205e769e7931339eb4ba57db0bfc2a4f9 Mon Sep 17 00:00:00 2001 From: EvanRees Date: Tue, 5 May 2020 11:21:13 -0500 Subject: [PATCH 1/4] Add entrypoint functionality. Update docstrings. * resolves issue-#54 * :memo: Update docstrings missing for functions. * :racehorse: Add cache for properties reading assembly sequences. * :art: Add Entrypoint functionality for incorporation to packaging/distribution. * :art::bug: Update main() to handle updated functions. --- autometa/common/mag.py | 57 ++++--- autometa/common/metagenome.py | 294 +++++++++++++++++++++++----------- autometa/taxonomy/ncbi.py | 2 +- 3 files changed, 235 insertions(+), 118 deletions(-) diff --git a/autometa/common/mag.py b/autometa/common/mag.py index 102266e17..194294eb7 100644 --- a/autometa/common/mag.py +++ b/autometa/common/mag.py @@ -32,7 +32,9 @@ import numpy as np from Bio import SeqIO +from Bio.SeqIO.FastaIO import SimpleFastaParser from Bio import SeqUtils +from functools import lru_cache from autometa.common.markers import Markers,MARKERS_DIR from autometa.common import kmers @@ -63,8 +65,29 @@ def __init__(self, assembly, contigs, outdir=None): self.nseqs = len(self.contigs) @property + @lru_cache(maxsize=None) + def assembly_seqs(self): + with open(self.assembly) as fh: + return [seq for title,seq in SimpleFastaParser(fh)] + + @property + @lru_cache(maxsize=None) + def mag_seqs(self): + """Retrieve sequences from assembly contained in `self.contigs`. + + Returns + ------- + list + list of SeqIO [SeqRecords, ...] + + """ + return [seq for seq in SeqIO.parse(self.assembly, 'fasta') + if seq.id in self.contigs] + + @property + @lru_cache(maxsize=None) def totalsize(self): - return sum([len(rec) for rec in self.get_seqs(all=True)]) + return sum(len(rec) for rec in self.assembly_seqs) @property def size_pct(self): @@ -72,19 +95,23 @@ def size_pct(self): @property def nallseqs(self): - return len(self.get_seqs(all=True)) + return len(self.assembly_seqs) @property def seqs_pct(self): return self.nseqs / self.nallseqs * 100 @property + @lru_cache(maxsize=None) def size(self): - return sum(len(seq) for seq in self.get_seqs()) + return sum(len(seq) for seq in self.mag_seqs) @property - def gc_content(self): - return np.mean([SeqUtils.GC(rec.seq) for rec in self.get_seqs()]) + @lru_cache(maxsize=None) + def length_weighted_gc(self): + weights = [len(rec.seq)/self.size for rec in self.mag_seqs] + gc_content = [SeqUtils.GC(rec.seq) for rec in self.mag_seqs] + return np.average(a=gc_content, weights=weights) @property def nucl_orfs_exist(self): @@ -143,26 +170,6 @@ def write_orfs(self, fpath, orf_type='prot'): records = self.get_orfs(orf_type=orf_type) SeqIO.write(records, fpath, 'fasta') - def get_seqs(self, all=False): - """Retrieve sequences from assembly. - - Parameters - ---------- - all : bool - Gets all sequences from assembly if True else sequences for MAG - (the default is False). - - Returns - ------- - list - list of SeqIO [SeqRecords, ...] - - """ - if all: - return [seq for seq in SeqIO.parse(self.assembly, 'fasta')] - return [seq for seq in SeqIO.parse(self.assembly, 'fasta') - if seq.id in self.contigs] - def describe(self, autometa_details=True): print(f''' M.A.G. 
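Editor's note: the `length_weighted_gc` property above computes `np.average(gc_content, weights=weights)` where the weights are each contig's share of total length. Because those weights sum to 1, this is the same as total GC bases over total bases. A minimal sketch of that arithmetic in plain Python, using hypothetical sequence strings in place of the parsed records (one caveat worth flagging: stacking `@lru_cache` under `@property` works, but on Python >= 3.8 `functools.cached_property` is the more idiomatic spelling and avoids `lru_cache` pinning `self` in the cache):

```python
# Minimal sketch of the length-weighted GC calculation introduced above,
# using plain Python instead of Bio.SeqUtils/numpy. The sequence strings
# are hypothetical stand-ins for records parsed from the assembly.
seqs = ["ATGCGC", "ATATATATAT", "GGCC"]

total_bp = sum(len(s) for s in seqs)
# Per-contig GC fraction, weighted by each contig's share of total length.
weighted_gc = sum(
    (s.upper().count("G") + s.upper().count("C")) / len(s) * (len(s) / total_bp)
    for s in seqs
) * 100
print(f"{weighted_gc:4.2f}%")  # 40.00% for this toy input
```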
Details diff --git a/autometa/common/metagenome.py b/autometa/common/metagenome.py index 4810825b0..55cafa0be 100644 --- a/autometa/common/metagenome.py +++ b/autometa/common/metagenome.py @@ -25,7 +25,9 @@ """ +import decimal import logging +import numbers import os import numpy as np @@ -34,6 +36,7 @@ from Bio import SeqIO from Bio.SeqIO.FastaIO import SimpleFastaParser from Bio import SeqUtils +from functools import lru_cache from autometa.common import kmers from autometa.common import coverage @@ -91,10 +94,10 @@ class Metagenome: [SeqRecord,...] nseqs : int Number of sequences in assembly. - mean_gc : float - Mean GC% of assembly. + length_weighted_gc : float + Length weighted average GC% of assembly. size : int - Total assembly size. + Total assembly size in bp. largest_seq : str id of longest sequence in assembly nucls : list @@ -136,11 +139,13 @@ def __str__(self): return self.assembly @property + @lru_cache(maxsize=None) def sequences(self): with open(self.assembly) as fh: return [seq for title,seq in SimpleFastaParser(fh)] @property + @lru_cache(maxsize=None) def seqrecords(self): return [seq for seq in SeqIO.parse(self.assembly, 'fasta')] @@ -149,8 +154,11 @@ def nseqs(self): return len(self.sequences) @property - def mean_gc(self): - return np.mean([SeqUtils.GC(seq) for seq in self.sequences]) + @lru_cache(maxsize=None) + def length_weighted_gc(self): + weights = [len(seq)/self.size for seq in self.sequences] + gc_counts = [SeqUtils.GC(seq) for seq in self.sequences] + return np.average(a=gc_counts, weights=weights) @property def size(self): @@ -168,6 +176,7 @@ def largest_seq(self): @property def orfs_called(self): + # COMBAK: Add checkpointing checksum check here for fp in [self.prot_orfs_fpath, self.nucl_orfs_fpath]: if not os.path.exists(fp): return False @@ -176,15 +185,18 @@ def orfs_called(self): return True @property + @lru_cache(maxsize=None) def nucls(self): return self.orfs(orf_type='nucl') @property + @lru_cache(maxsize=None) def prots(self): return self.orfs(orf_type='prot') @property def taxonomy_assigned(self): + # COMBAK: Add checkpointing checksum check here if os.path.exists(self.taxonomy_fpath) and os.stat(self.taxonomy_fpath).st_size > 0: return True return False @@ -226,7 +238,7 @@ def describe(self): N50: {self.fragmentation_metric():,} bp N10: {self.fragmentation_metric(.1):,} bp N90: {self.fragmentation_metric(.9):,} bp -Mean GC: {self.mean_gc:4.2f}% +Length Weighted Avg. GC content: {self.length_weighted_gc:4.2f}% Largest sequence: {self.largest_seq} ________________________ Autometa Details @@ -244,6 +256,11 @@ def describe(self): def length_filter(self, out, cutoff=3000): """Filters sequences by length with provided cutoff. + Note: A WARNING will be emitted if the length filter is applied *after* + the ORFs provided for the Metagenome are already called prompting the + user to perform orf calling again to correspond to length filtered + contigs. 
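Editor's note: the hunk above caches two parallel views of the assembly, `sequences` (plain strings via `SimpleFastaParser`) and `seqrecords` (full `SeqRecord` objects via `SeqIO.parse`). A sketch of why both exist, assuming a hypothetical FASTA path; the string parser is the lighter choice when only composition statistics are needed, while `SeqRecord`s carry the ids needed for filtering and rewriting:

```python
from Bio import SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser

assembly = "assembly.fna"  # hypothetical path

# Lightweight view: (title, sequence-string) pairs; enough for GC/size stats.
with open(assembly) as fh:
    seqs = [seq for _title, seq in SimpleFastaParser(fh)]

# Full view: SeqRecord objects, needed when ids must be kept
# (e.g. length_filter writes surviving records back out with SeqIO.write).
records = list(SeqIO.parse(assembly, "fasta"))

assert len(seqs) == len(records)
```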
+ Parameters ---------- cutoff : int @@ -263,11 +280,8 @@ def length_filter(self, out, cutoff=3000): FileExistsError filepath consisting of sequences that passed filter already exists """ - try: - cutoff = float(cutoff) - except Exception as err: - pass - if not type(cutoff) in [int, float]: + if not isinstance(cutoff, numbers.Number) or isinstance(cutoff, bool): + # https://stackoverflow.com/a/4187220/13118765 raise TypeError(f'cutoff: {cutoff} must be a float or int') if cutoff <= 0: raise ValueError(f'cutoff: {cutoff} must be a positive real number') @@ -280,7 +294,12 @@ def length_filter(self, out, cutoff=3000): if not os.path.exists(gunzipped_fpath): gunzip(self.assembly, gunzipped_fpath) self.assembly = gunzipped_fpath + logger.info(f'Getting contigs greater than or equal to {cutoff:,} bp') records = [seq for seq in self.seqrecords if len(seq) >= cutoff] + if self.orfs_called: + msg = (f'{self.nucl_orfs_fpath} and {self.prot_orfs_fpath} have already been called!' + 'Call orfs again to retrieve only ORFs corresponding to filtered assembly') + logger.warning(msg) SeqIO.write(records, out, 'fasta') return Metagenome( assembly=out, @@ -317,13 +336,13 @@ def call_orfs(self, force=False, cpus=0, parallel=True): OSError ORF calling failed. """ - if type(force) is not bool: - raise TypeError(f'force:({force}) must be a boolean. I.e. True|False') - if type(parallel) is not bool: - raise TypeError(f'parallel:({parallel}) must be a boolean. I.e. True|False') - if type(cpus) is not int: - raise TypeError(f'cpus:({cpus}) must be an integer') - # OPTIMIZE: Should not need to call ORFs on contigs below length cutoff + for arg, argname in zip([force, parallel],['force','parallel']): + if not isinstance(arg, bool) and isinstance(arg, numbers.Number): + raise TypeError(f'{argname} must be a boolean!') + if not isinstance(cpus, int) or isinstance(cpus, bool): + raise TypeError(f'cpus:({cpus}) must be an integer!') + + # COMBAK: Add checkpointing checksum check here try: nucls_fp, prots_fp = prodigal.run( assembly=self.assembly, @@ -361,7 +380,7 @@ def orfs(self, orf_type='prot', cpus=0): if not self.orfs_called: self.call_orfs(cpus=cpus) if orf_type not in {'prot','nucl'}: - raise ValueError('orf_type must be \'prot\' or \'nucl\'!') + raise ValueError('orf_type must be "prot" or "nucl"!') orfs_fpath = self.prot_orfs_fpath if orf_type == 'prot' else self.nucl_orfs_fpath return [orf for orf in SeqIO.parse(orfs_fpath, 'fasta')] @@ -372,14 +391,14 @@ def get_kmers(self, kmer_size=5, multiprocess=True, out=None, normalized=None, Parameters ---------- - kmer_size : int + kmer_size : int, optional length of k-mer to count (the default is 5). - normalized : str - Perform Centered-log ratio normalization on counted k-mers (the default is None). - and write to provided `normalized` path. - out : str + normalized : str, optional + Perform Centered-log ratio normalization on counted k-mers and write + to provided `normalized` file path (the default is None). + out : str, optional Write counted k-mers to `out` (the default is None). - force : bool + force : bool, optional Overwrite existing k-mers `out` file (the default is False). Returns @@ -396,7 +415,7 @@ def get_kmers(self, kmer_size=5, multiprocess=True, out=None, normalized=None, case2 = out_specified and out_exists and force case3 = out_specified and not out_exists if case1: - logger.warning(f'FileExistsError: {out} force to overwrite. [retrieving]') + logger.warning(f'FileExists: {out} force to overwrite. 
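Editor's note: the rewritten cutoff validation above swaps the old try/float coercion for `numbers.Number` with an explicit `bool` exclusion. The reason is that `bool` is a subclass of `int` in Python, so `True` would otherwise pass as a valid length cutoff. A standalone sketch of the idiom:

```python
import numbers

def validate_cutoff(cutoff):
    # bool is a subclass of int, so isinstance(True, numbers.Number) is True;
    # the extra check keeps True/False from slipping through as "3000-like".
    if not isinstance(cutoff, numbers.Number) or isinstance(cutoff, bool):
        raise TypeError(f"cutoff: {cutoff} must be a float or int")
    if cutoff <= 0:
        raise ValueError(f"cutoff: {cutoff} must be a positive real number")
    return cutoff

validate_cutoff(3000)      # ok
# validate_cutoff(True)    # TypeError
# validate_cutoff(-5)      # ValueError
```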
[retrieving]') return pd.read_csv(out, sep='\t', index_col='contig') normalize_kmers = True if normalized else False logger.info(f'Counting {kmer_size}-mers. Normalize: {normalize_kmers}') @@ -412,10 +431,32 @@ def get_kmers(self, kmer_size=5, multiprocess=True, out=None, normalized=None, normalized_df = kmers.normalize(kmers_df) normalized_df.to_csv(normalized, sep='\t', header=True, index=True) return normalized_df - return kmers_df + else: + return kmers_df @timeit def get_coverages(self, out, from_spades=True, **kwargs): + """Retrieve contig coverages using provided `assembly` and `*_reads` or + if the metagenome assembly was generated from SPAdes, use the k-mer coverages + provided in each contig's header. + + Parameters + ---------- + out : str + + from_spades : bool + Description of parameter `from_spades` (the default is True). + **kwargs : dict + May contain the following keys: 'sam', 'bam', 'lengths', 'bed' + Keys should correspond to their respective alignment files. + 'lengths' is a path to a tab-delimited table of contig and its length. + + Returns + ------- + pd.DataFrame + index=contig cols=['coverage'] + + """ if from_spades: return coverage.from_spades_names(self.seqrecords) return coverage.get( @@ -438,15 +479,23 @@ def assign_taxonomy(self, method, force=False, *args, **kwargs): ---------- force : bool, optional overwrite existing voting method's file (the default is False). - *args : type + *args : list Description of parameter `*args`. - **kwargs : type - Description of parameter `**kwargs`. + **kwargs : dict + May contain the following keys: + + * cpus : int, num. cpus to use + * ncbi : str, + * usepickle : bool, whether to pickle taxonomy-specific files + * verbose : bool, Add verbosity to stream to terminal + * blast : str, + * hits : str, + * force : bool, force overwrite existing results files Raises ------- - ExceptionName - Why the exception is raised. + NotImplementedError + Provided `method` not yet implemented. """ logger.debug(f'assigning taxonomy via {method}') @@ -477,9 +526,17 @@ def get_kingdoms(self, **kwargs): Parameters ---------- - **kwargs : dict - Optional additional keyword arguments to supply to assign_taxonomy - and supply a separate NCBI_DIR + **kwargs : dict, optional + May contain the following keys: + + * cpus : int, num. cpus to use + * ncbi : str, + * usepickle : bool, whether to pickle taxonomy-specific files + * verbose : bool, Add verbosity to stream to terminal + * blast : str, + * hits : str, + * force : bool, force overwrite existing results files + Returns ------- @@ -506,6 +563,7 @@ def get_kingdoms(self, **kwargs): left_on='taxid', right_index=True) self.taxonomy.to_csv(self.taxonomy_fpath,sep='\t',index=True,header=True) + # COMBAK: Add checkpointing checksum check here logger.debug(f'Added canonical rank names to {self.taxonomy_fpath}') if 'superkingdom' not in self.taxonomy.columns: raise KeyError(f'superkingdom is not in taxonomy columns {self.taxonomy.columns}') @@ -526,7 +584,7 @@ def write_ranks(self, rank='superkingdom'): Returns ------- list - [rank_name_fp, ...] + [rank_name_fpath, ...] 
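Editor's note: `get_kmers` above reduces its file caching to three cases over `out` and `force`. A hedged sketch of the same decision table in isolation, where `compute` is a hypothetical callable standing in for `kmers.count` + normalization:

```python
import os
import pandas as pd

def cached_table(compute, out=None, force=False):
    """Sketch of get_kmers' case logic: reuse `out` unless `force` is set.
    `compute` is a hypothetical zero-arg callable returning a DataFrame."""
    out_specified = out is not None
    out_exists = os.path.exists(out) if out else False
    if out_specified and out_exists and not force:
        # case 1: result already on disk and no overwrite requested
        return pd.read_csv(out, sep="\t", index_col="contig")
    df = compute()  # cases 2 & 3 (or no `out` at all): recompute
    if out_specified:
        df.to_csv(out, sep="\t", header=True, index=True)
    return df
```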
Raises ------- @@ -537,88 +595,140 @@ def write_ranks(self, rank='superkingdom'): if rank not in NCBI.CANONICAL_RANKS: raise ValueError(f'rank: {rank} not in {NCBI.CANONICAL_RANKS}') fpaths = [] - for rank_name,dff in self.taxonomy.groupby(rank): - records = [r for r in SeqIO.parse(self.assembly, 'fasta') if r.id in dff.index] + for rank_name, dff in self.taxonomy.groupby(rank): + records = [r for r in self.seqrecords if r.id in dff.index] rank_name = rank_name.replace(' ','_') rank_name_fname = '.'.join([rank_name.title(),'fna']) - rank_name_fp = os.path.join(self.outdir, rank_name_fname) + rank_name_fpath = os.path.join(self.outdir, rank_name_fname) if not records: - logger.warning(f'No records to write to {rank_name_fp}') + logger.warning(f'No records to write to {rank_name_fpath}') else: - n_written = SeqIO.write(records, rank_name_fp, 'fasta') - logger.debug(f'Wrote {n_written} records to {rank_name_fp}') - fpaths.append(rank_name_fp) + n_written = SeqIO.write(records, rank_name_fpath, 'fasta') + logger.debug(f'Wrote {n_written} records to {rank_name_fpath}') + fpaths.append(rank_name_fpath) return fpaths def main(): import argparse import logging as logger logger.basicConfig( - format='%(asctime)s : %(name)s : %(levelname)s : %(message)s', + format='[%(asctime)s %(levelname)s] %(name)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logger.DEBUG) + parser = argparse.ArgumentParser(description=""" - Metagenome class holding attributes and methods to manipulate metagenome assemblies. - """) - parser.add_argument('assembly', help='') - parser.add_argument('--ncbi', help='', required=True) - parser.add_argument('--cutoff', help='length to filter sequences',default=3000, - type=int) - parser.add_argument('--kmer-size', default=5, type=int) - parser.add_argument( - '--kmer-normalized', - help='Perform CLR transform on k-mer frequencies if provided. ()', - ) - parser.add_argument( - '--kmer-fpath', - help='', - ) + This handles filtering by length and taxon as well as ORF calling, + and various metagenome statistics. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('assembly', help='Path to metagenome assembly (nucleotide fasta).') + parser.add_argument('outdir', help='Output directory to store annotations.') + parser.add_argument('nucls', + help='Path to nucleotide ORFs corresponding to `assembly.` ' + '(Will write to path if ORFs do not exist).', + type=str) + parser.add_argument('prots', + help='Path to amino acid ORFs corresponding to `assembly.` ' + '(Will write to path if ORFs do not exist).', + type=str) + parser.add_argument('--filter', + help='Filter to apply to metagenome. 
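Editor's note: `write_ranks` above groups the taxonomy table by a canonical rank and writes one FASTA per rank name. A minimal pandas/Biopython sketch of that pattern with hypothetical inputs, mirroring the hunk's groupby-then-write loop:

```python
import os
import pandas as pd
from Bio import SeqIO

def write_rank_fastas(taxonomy, seqrecords, outdir, rank="superkingdom"):
    """taxonomy: DataFrame indexed by contig with a `rank` column;
    seqrecords: list of SeqRecord parsed from the assembly (hypothetical)."""
    fpaths = []
    for rank_name, dff in taxonomy.groupby(rank):
        records = [r for r in seqrecords if r.id in dff.index]
        fname = f"{rank_name.replace(' ', '_').title()}.fna"
        fpath = os.path.join(outdir, fname)
        if records:
            SeqIO.write(records, fpath, "fasta")
            fpaths.append(fpath)
    return fpaths
```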
You may supply multiple filters.', + choices=['length','taxon'], nargs='*') + parser.add_argument('--cutoff', help='Cutoff to apply to length filter', + default=3000, + type=int, + metavar='') + parser.add_argument('--length-fname', + help='Filename to assign fasta after applying length filter.', + default='`assembly`.filtered.fna') parser.add_argument('--taxon-method', default='majority_vote', choices=['majority_vote']) - parser.add_argument('--vote-fname', help='', + parser.add_argument('--taxon-fname', + help='Filename to assign voted taxonomy annotation.', default='taxonomy_vote.tsv') + parser.add_argument('--ncbi', + help='Path to NCBI databases directory.', + default=NCBI_DIR) + parser.add_argument('--pickle', help='Whether to serialize taxonomy-specific files', + action='store_true', default=False) + parser.add_argument('--blast', + help='Path to diamond blast results.tsv. (outfmt=6) ' + '(Will write to path if it does not exist).', + default=None) + parser.add_argument('--hits', + help='Path to diamond blast hits.pkl.gz. ' + '(Will write to path if it does not exist).', + default=None) # Eventually will need to create a subparser for the taxon assignment methods # to include help information and required parameters according to method. - parser.add_argument('--cpus', help='num cpus to use', default=0) - parser.add_argument('--noparallel', help="Do not use GNU parallel", - action='store_false', default=True) - parser.add_argument('--force', help="overwrite existing files", + parser.add_argument('--cpus', help='Number of cpus to use when performing ' + 'annotations (Default will use all possible if `--parallel` is supplied).', + default=0, type=int) + parser.add_argument('--parallel', help='Use GNU parallel when performing annotations.', action='store_true', default=False) - parser.add_argument('--verbose', help="add verbosity", + parser.add_argument('--force', help="Overwrite existing files", action='store_true', default=False) + parser.add_argument('--verbose', help="Log more information to terminal.", + action='store_true', default=False) + parser.add_argument('--stats', help="Print various metagenome assembly statistics.", + action='store_true', default=False) + args = parser.parse_args() - raw_mags = Metagenome(args.assembly) - mags = raw_mags.length_filter(cutoff=args.cutoff) - logger.info(f'{args.cutoff:,}bp length filter {raw_mags.nseqs:,} to {mags.nseqs:,} seqs') - logger.info(f'CallingORFs: force:{args.force}. cpus:{args.cpus} parallel:{args.noparallel}') + + taxonomy_fpath = os.path.join(args.outdir, args.taxon_fname) + raw_mg = Metagenome( + assembly=args.assembly, + outdir=args.outdir, + prot_orfs_fpath=args.prots, + nucl_orfs_fpath=args.nucls, + taxonomy_fpath=taxonomy_fpath, + taxon_method=args.taxon_method) + + if args.filter and 'length' in args.filter: + if args.length_fname != '`assembly`.filtered.fna': + length_filtered_fpath = os.path.join(raw_mg.outdir, args.length_fname) + else: + basename = os.path.basename(raw_mg.assembly) + fname = os.path.splitext(basename)[0] + length_fname = f'{fname}.filtered.fna' + length_filtered_fpath = os.path.join(raw_mg.outdir, length_fname) + try: + mg = raw_mg.length_filter(out=length_filtered_fpath, cutoff=args.cutoff) + except FileExistsError as err: + logger.warning(f'FileExistsError: {length_filtered_fpath}. 
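Editor's note: the new `--filter` flag above combines `choices` with `nargs='*'`, so callers can stack filters (`--filter length taxon`) while argparse still validates each value. A small sketch of how that resolves, with a hypothetical prog name:

```python
import argparse

parser = argparse.ArgumentParser(prog="metagenome.py")  # hypothetical prog
parser.add_argument("--filter", choices=["length", "taxon"], nargs="*",
                    help="Filter(s) to apply; each value is checked "
                         "against choices independently.")
args = parser.parse_args(["--filter", "length", "taxon"])
assert args.filter == ["length", "taxon"]
# When --filter is omitted entirely, args.filter is None, hence the
# `if args.filter and 'length' in args.filter` guards later in main().
```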
Skipping...') + mg = Metagenome( + assembly=length_filtered_fpath, + outdir=raw_mg.outdir, + nucl_orfs_fpath=raw_mg.nucl_orfs_fpath, + prot_orfs_fpath=raw_mg.prot_orfs_fpath, + taxonomy_fpath=raw_mg.taxonomy_fpath, + taxon_method=raw_mg.taxon_method) + else: + mg = raw_mg + + if args.stats: + mg.describe() + try: - mags.call_orfs( + mg.call_orfs( force=args.force, - verbose=args.verbose, cpus=args.cpus, - parallel=args.noparallel, + parallel=args.parallel, ) except FileExistsError as err: - logger.warning(f'FileExistsError: {mags.prots_out}. Skipping...') - logger.info(f'TaxonAssignment - method:{args.taxon_method}') - mags.get_kingdoms( - method=args.taxon_method, - fasta=mags.prots_out, - ncbi_dir=args.ncbi, - outdir=mags.dirname, - blast=None, - usepickle=True, - verbose=args.verbose, - ) - logger.info(f'Getting k-mers of size {args.kmer_size}. Normalize: {args.kmer_normalized}') - kmer_fpath = args.kmer_fpath if args.kmer_fpath else os.path.join(mags.dirname, 'kmers.tsv') - kmers_df = mags.get_kmers( - kmer_size=args.kmer_size, - normalized=args.kmer_normalized, - out=kmer_fpath, - force=args.force, - ) - logger.info('Done') + logger.warning(f'FileExistsError: {mg.prots_out}. Skipping...') + + if args.filter and 'taxon' in args.filter: + logger.info(f'TaxonAssignment - method:{args.taxon_method}') + mg.get_kingdoms( + cpus=args.cpus, + ncbi=args.ncbi, + usepickle=args.pickle, + verbose=args.verbose, + blast=args.blast, + hits=args.hits, + force=args.force, + ) if __name__ == '__main__': main() diff --git a/autometa/taxonomy/ncbi.py b/autometa/taxonomy/ncbi.py index a9f577f32..8bb886d24 100644 --- a/autometa/taxonomy/ncbi.py +++ b/autometa/taxonomy/ncbi.py @@ -41,7 +41,7 @@ logger = logging.getLogger(__name__) -BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) +BASE_DIR = os.path.dirname(os.path.dirname(__file__)) NCBI_DIR = os.path.join(BASE_DIR,'databases','ncbi') class NCBI: From b06178bb5a8ce784fb9ba844b33afbe678ed30d0 Mon Sep 17 00:00:00 2001 From: EvanRees Date: Tue, 5 May 2020 17:22:43 -0500 Subject: [PATCH 2/4] Update MAG class to MetaBin. * :memo: Add docstrings to methods and class * :art::fire: Remove split_nucleotides method. * :art: Update get_binning function to handle coverages as filepath not pd.DataFrame * :art: Change mag.py to metabin.py * :racehorse: fixed from PR-#66. Add cache functionality to time-consuming property methods. * :memo: Add COMBAK comment for checkpointing. Return to this when implemented in utilities.py --- autometa/common/{mag.py => metabin.py} | 276 ++++++++++++++++++++----- autometa/common/metagenome.py | 6 +- autometa/config/user.py | 2 +- 3 files changed, 230 insertions(+), 54 deletions(-) rename autometa/common/{mag.py => metabin.py} (55%) diff --git a/autometa/common/mag.py b/autometa/common/metabin.py similarity index 55% rename from autometa/common/mag.py rename to autometa/common/metabin.py index 194294eb7..dc8f291bf 100644 --- a/autometa/common/mag.py +++ b/autometa/common/metabin.py @@ -46,17 +46,44 @@ logger = logging.getLogger(__name__) -class MAG: - """docstring for Autometa MAG class.""" +class MetaBin: + """Autometa MetaBin class to manipulate/annotate contigs from a metagenome. + + Parameters + ---------- + assembly : str + + contigs : list + List of contigs to manipulate/annotate (must be contained in metagenome). + outdir : str, optional + (Default is the directory storing the `assembly`). 
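Editor's note: `main()` above treats a pre-existing length-filtered FASTA as a cache hit; `length_filter` raises `FileExistsError`, and the handler rebuilds a `Metagenome` around the file already on disk. A reduced sketch of that recover-and-reuse flow, with the wrapper function itself being hypothetical:

```python
def filtered_or_cached(raw_mg, out, cutoff):
    """Hypothetical wrapper around the try/except in main(): filter the
    assembly, or fall back to the filtered file from a previous run."""
    try:
        return raw_mg.length_filter(out=out, cutoff=cutoff)
    except FileExistsError:
        # The filtered assembly already exists; reuse it rather than
        # overwriting (force semantics are handled elsewhere).
        return type(raw_mg)(
            assembly=out,
            outdir=raw_mg.outdir,
            nucl_orfs_fpath=raw_mg.nucl_orfs_fpath,
            prot_orfs_fpath=raw_mg.prot_orfs_fpath,
            taxonomy_fpath=raw_mg.taxonomy_fpath,
            taxon_method=raw_mg.taxon_method,
        )
```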
+ + Attributes + ---------- + basename : str + base name of `assembly` + root : str + root name of `assembly` (Will remove common extension like '.fasta') + nucls_fname : str + File name of `contigs` nucleotide ORFs + prots_fname : str + File name of `contigs` amino-acid ORFs + nucl_orfs_fpath : str + + prot_orfs_fpath : str + + nseqs : int + Number of contigs in MetaBin + + """ def __init__(self, assembly, contigs, outdir=None): self.assembly = os.path.realpath(assembly) - self.outdir = os.path.realpath(outdir) if outdir else os.path.dirname(self.assembly) self.basename = os.path.basename(self.assembly) - self.assembly_name = self.basename.split('.')[0] + self.root = os.path.splitext(self.basename)[0] + self.outdir = os.path.realpath(outdir) if outdir else os.path.dirname(self.assembly) nucls_ext = 'orfs.fna' prots_ext = 'orfs.faa' - self.root = os.path.splitext(self.basename)[0] self.nucls_fname = '.'.join([self.root, nucls_ext]) self.prots_fname = '.'.join([self.root, prots_ext]) self.nucl_orfs_fpath = os.path.join(self.outdir, self.nucls_fname) @@ -67,13 +94,21 @@ def __init__(self, assembly, contigs, outdir=None): @property @lru_cache(maxsize=None) def assembly_seqs(self): + """Retrieve all of the sequences from the provided `assembly`. + + Returns + ------- + list + list of str [seq, ...] + + """ with open(self.assembly) as fh: return [seq for title,seq in SimpleFastaParser(fh)] @property @lru_cache(maxsize=None) - def mag_seqs(self): - """Retrieve sequences from assembly contained in `self.contigs`. + def seqrecords(self): + """Retrieve seqrecords from assembly contained in `self.contigs`. Returns ------- @@ -87,59 +122,155 @@ def mag_seqs(self): @property @lru_cache(maxsize=None) def totalsize(self): + """Get the total assembly size in bp. + + Returns + ------- + int + Sum of sequence lengths in `assembly` + + """ return sum(len(rec) for rec in self.assembly_seqs) @property def size_pct(self): + """Get the size of the `contigs` in reference to the `assembly` size. + This is expressed as a percentage + + Returns + ------- + float + Size percentage of `contigs` covering the total `assembly` size. + + """ return self.size / self.totalsize * 100 @property def nallseqs(self): + """Get the total number of sequences in `assembly`. + + Returns + ------- + int + Number of sequences contained in `assembly` + + """ return len(self.assembly_seqs) @property def seqs_pct(self): + """Get the percentage of the number of `contigs` compared to the number + of sequences in the `assembly`. + + Returns + ------- + float + percentage of `contigs` corresponding to num. seqs. in `assembly`. + + """ return self.nseqs / self.nallseqs * 100 @property @lru_cache(maxsize=None) def size(self): - return sum(len(seq) for seq in self.mag_seqs) + """Get the summation of the lengths of `contigs` in bp. + + Returns + ------- + int + Summation of lengths of `contigs` in bp. + + """ + return sum(len(seq) for seq in self.seqrecords) @property @lru_cache(maxsize=None) def length_weighted_gc(self): - weights = [len(rec.seq)/self.size for rec in self.mag_seqs] - gc_content = [SeqUtils.GC(rec.seq) for rec in self.mag_seqs] + """Get the length weighted GC content of `contigs`. + + Returns + ------- + float + GC percentage weighted by contig length. + + """ + weights = [len(rec.seq)/self.size for rec in self.seqrecords] + gc_content = [SeqUtils.GC(rec.seq) for rec in self.seqrecords] return np.average(a=gc_content, weights=weights) @property def nucl_orfs_exist(self): + """Determine whether `nucl_orfs_fpath` exists. 
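Editor's note: `MetaBin.__init__` above derives both ORF paths from the assembly's root name via `os.path.splitext`, so `scaffolds.fasta` yields `scaffolds.orfs.fna` and `scaffolds.orfs.faa` under `outdir`. A sketch of just that path plumbing:

```python
import os

def orf_paths(assembly, outdir=None):
    assembly = os.path.realpath(assembly)
    outdir = os.path.realpath(outdir) if outdir else os.path.dirname(assembly)
    # splitext strips one trailing extension: 'scaffolds.fasta' -> 'scaffolds'
    # (a gzipped 'scaffolds.fasta.gz' would only lose the '.gz').
    root = os.path.splitext(os.path.basename(assembly))[0]
    nucl = os.path.join(outdir, f"{root}.orfs.fna")
    prot = os.path.join(outdir, f"{root}.orfs.faa")
    return nucl, prot

print(orf_paths("/data/scaffolds.fasta"))
# ('/data/scaffolds.orfs.fna', '/data/scaffolds.orfs.faa')
```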
+ + Returns + ------- + bool + True is returned if `nucl_orfs_fpath` is a valid path, otherwise False. + + """ return self.prepared(self.nucl_orfs_fpath) @property def prot_orfs_exist(self): + """Determine whether `prot_orfs_fpath` exists. + + Returns + ------- + bool + True is returned if `prot_orfs_fpath` is a valid path, otherwise False. + + """ return self.prepared(self.prot_orfs_fpath) @property def nnucls(self): + """Get the number of nucleotide ORFs in `nucl_orfs_fpath`. + + Returns + ------- + int + Number of nucleotide ORFs in `nucl_orfs_fpath`. + + """ if not self.nucl_orfs_exist: return np.nan return len(self.get_orfs('nucl')) @property def nprots(self): + """Get the number of amino-acid ORFs in `prot_orfs_fpath`. + + Returns + ------- + int + Number of amino-acid ORFs in `prot_orfs_fpath`. + + """ if not self.prot_orfs_exist: return np.nan return len(self.get_orfs('prot')) def prepared(self, fpath): + """Check whether `fpath` exists and is valid via checksum in checkpoints. + + Parameters + ---------- + fpath : str + + + Returns + ------- + bool + Whether provided `fpath` has been generated and is valid + + """ + # COMBAK: Add checkpointing checksum check here if os.path.exists(fpath) and os.stat(fpath).st_size > 0: return True return False def get_orfs(self, orf_type='prot'): - """Retrieve ORFs corresponding to MAG. + """Retrieve ORFs corresponding to MetaBin. Parameters ---------- @@ -154,12 +285,17 @@ def get_orfs(self, orf_type='prot'): Raises ------- + KeyError + `orf_type` not in ['prot','nucl'] FileNotFoundError - Why the exception is raised. + Either `self.prot_orfs_fpath` or `self.nucl_orfs_fpath` not found. + This will correspond to `orf_type` """ orfs_fpaths = {'prot':self.prot_orfs_fpath, 'nucl':self.nucl_orfs_fpath} - orfs_fpath = orfs_fpaths.get(orf_type, 'prot') + if orf_type not in orfs_fpaths: + raise KeyError(f'{orf_type} not in {orfs_fpaths}') + orfs_fpath = orfs_fpaths[orf_type] if not self.prepared(orfs_fpath): raise FileNotFoundError(orfs_fpath) return prodigal.orf_records_from_contigs( @@ -167,37 +303,55 @@ def get_orfs(self, orf_type='prot'): fpath=self.prot_orfs_fpath) def write_orfs(self, fpath, orf_type='prot'): + """Write `orf_type` ORFs from `contigs` to `fpath`. + + Parameters + ---------- + fpath : str + + * embedded : str, + * do_pca : bool, Perform PCA prior to embedding + * embedding_method : str, Embedding method to use choices=['sksne','bhsne','umap'] + * perplexity : float, perplexity setting for embedding method sksne/bhsne + * coverage : str, + * taxonomy : str, + * domain : str, kindom to bin choices=['bacteria','archaea'] + * purity : float, purity cutoff to apply to bins + * completeness : float, completeness cutoff to apply to bins + * reverse : bool, If true will bin taxa from least to most specific rank + Returns ------- pd.DataFrame @@ -213,8 +379,8 @@ def get_binning(self, method='recursive_dbscan', **kwargs): Raises ------- - ExceptionName - Why the exception is raised. + NotImplementedError + Provided `method` is not yet implemented. 
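Editor's note: the `get_orfs` fix above is worth spelling out. The old `orfs_fpaths.get(orf_type, 'prot')` silently returned the literal string `'prot'` (not a path) for any unrecognized `orf_type`, surfacing later as a confusing `FileNotFoundError('prot')`. The revision fails fast with a `KeyError` listing the valid choices. A standalone sketch with hypothetical paths:

```python
def lookup_orfs_fpath(orf_type, orfs_fpaths):
    # Fail fast on bad orf_type; the old `.get(orf_type, 'prot')` default
    # returned the string 'prot' itself, which is not a file path.
    if orf_type not in orfs_fpaths:
        raise KeyError(f"{orf_type} not in {orfs_fpaths}")
    return orfs_fpaths[orf_type]

fpaths = {"prot": "bin.orfs.faa", "nucl": "bin.orfs.fna"}  # hypothetical
lookup_orfs_fpath("prot", fpaths)       # -> 'bin.orfs.faa'
# lookup_orfs_fpath("protein", fpaths)  # KeyError listing valid choices
```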
""" if method == 'recursive_dbscan': @@ -224,22 +390,24 @@ def get_binning(self, method='recursive_dbscan', **kwargs): embedded=kwargs.get('embedded'), do_pca=kwargs.get('do_pca',True), pca_dimensions=kwargs.get('pca_dims',50), - method=kwargs.get('embedding_method','UMAP'), + method=kwargs.get('embedding_method','bhsne'), perplexity=kwargs.get('perplexity',30), ) except ValueError as error: logger.exception(error) master_df = embedded_df - coverage_df = kwargs.get('coverage') - if coverage_df is not None or not coverage_df.empty: + coverage_fp = kwargs.get('coverage') + if coverage_fp: + coverage_df = pd.read_csv(coverage_fp, sep='\t', index_col='contig') master_df = pd.merge( master_df, coverage_df, how='left', left_index=True, right_index=True) - if kwargs.get('taxonomy'): - taxa_df = pd.read_csv(kwargs.get('taxonomy'), sep='\t', index_col='contig') + taxonomy_fp = kwargs.get('taxonomy') + if taxonomy_fp: + taxa_df = pd.read_csv(taxonomy_fp, sep='\t', index_col='contig') master_df = pd.merge( left=master_df, right=taxa_df, @@ -283,11 +451,6 @@ def markers(self, kingdom='bacteria', dbdir=MARKERS_DIR, force=False): pd.DataFrame wide format - index_col='contig', columns=[PFAM,...] - Raises - ------- - ExceptionName - Why the exception is raised. - """ logger.debug(f'Retrieving markers for {kingdom} kingdom') @@ -298,6 +461,20 @@ def markers(self, kingdom='bacteria', dbdir=MARKERS_DIR, force=False): return markers.get() def subset_df(self, df): + """Retrieve subset of provided `df` containing only `contigs`. + + Parameters + ---------- + df : pd.DataFrame,pd.Series,str + `df` may be a pandas Series or DataFrame, or a path to file. + If a path is provided, `df` will be read as a tab-delimited table. + + Returns + ------- + pd.DataFrame + index='contig', cols=cols in provided `df`, subset by `contigs`. + + """ if type(df) not in [pd.DataFrame, pd.Series]: raise TypeError(f'Unable to subset df. {type(df)} is not Series or DataFrame') if type(df) is pd.DataFrame: @@ -313,15 +490,15 @@ def main(): format='%(asctime)s : %(name)s : %(levelname)s : %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logger.DEBUG) - parser = argparse.ArgumentParser(description='Autometa MAG Class') + parser = argparse.ArgumentParser(description='Autometa MetaBin Class') parser.add_argument('--assembly', help='', required=True) - parser.add_argument('--contigs', help='list of contigs in MAG',nargs='+', required=True) + parser.add_argument('--contigs', help='list of contigs in MetaBin',nargs='+', required=True) parser.add_argument('--domain', help='kingdom to use for binning', default='bacteria') parser.add_argument('--kmers', help=' Date: Wed, 13 May 2020 16:25:18 -0500 Subject: [PATCH 3/4] :memo: Add docstrings to class properties. 
* :memo::art: Add defaults formatter_class to template.py --- autometa/common/metagenome.py | 182 +++++++++++++++++++++++++++------- docs/template.py | 7 +- 2 files changed, 153 insertions(+), 36 deletions(-) diff --git a/autometa/common/metagenome.py b/autometa/common/metagenome.py index 55cafa0be..396cab5de 100644 --- a/autometa/common/metagenome.py +++ b/autometa/common/metagenome.py @@ -107,15 +107,16 @@ class Metagenome: Methods ---------- - - self.fragmentation_metric() - - self.describe() - - self.length_filter() - - self.call_orfs() - - self.orfs() - - self.get_kmers() - - self.assign_taxonomy() - - self.get_kingdoms() - - self.write_ranks() + * self.fragmentation_metric() + * self.describe() + * self.length_filter() + * self.call_orfs() + * self.orfs() + * self.get_kmers() + * self.assign_taxonomy() + * self.get_kingdoms() + * self.write_ranks() + """ def __init__(self, assembly, outdir, nucl_orfs_fpath, prot_orfs_fpath, taxonomy_fpath, taxon_method='majority_vote', fwd_reads=None, @@ -141,31 +142,79 @@ def __str__(self): @property @lru_cache(maxsize=None) def sequences(self): + """Retrieve the sequences from provided `assembly`. + + Returns + ------- + list + [seq, seq, ...] + + """ with open(self.assembly) as fh: return [seq for title,seq in SimpleFastaParser(fh)] @property @lru_cache(maxsize=None) def seqrecords(self): + """Retrieve SeqRecord objects from provided `assembly`. + + Returns + ------- + list + [SeqRecord, SeqRecord, ...] + + """ return [seq for seq in SeqIO.parse(self.assembly, 'fasta')] @property def nseqs(self): + """Retrieve the number of sequences in provided `assembly`. + + Returns + ------- + int + Number of sequences parsed from `assembly` + + """ return len(self.sequences) @property @lru_cache(maxsize=None) def length_weighted_gc(self): + """Retrieve the length weighted average GC percentage of provided `assembly`. + + Returns + ------- + float + GC percentage weighted by contig length. + + """ weights = [len(seq)/self.size for seq in self.sequences] gc_counts = [SeqUtils.GC(seq) for seq in self.sequences] return np.average(a=gc_counts, weights=weights) @property def size(self): + """Retrieve the summation of sizes for each contig in the provided `assembly`. + + Returns + ------- + int + Total summation of contig sizes in `assembly` + + """ return sum(len(seq) for seq in self.sequences) @property def largest_seq(self): + """Retrieve the name of the largest sequence in the provided `assembly`. + + Returns + ------- + str + record ID of the largest sequence in `assembly`. + + """ max = float('-inf') largest = None for rec in self.seqrecords: @@ -176,28 +225,72 @@ def largest_seq(self): @property def orfs_called(self): + """Retrieve whether `prot_orfs_fpath` and `nucl_orfs_fpath` have been called. + + Note: This will check whether the aforementioned paths exist and are not empty. + In the future, a checksum comparison will be performed to ensure file integrity. + + Returns + ------- + bool + Description of returned object. + + """ # COMBAK: Add checkpointing checksum check here for fp in [self.prot_orfs_fpath, self.nucl_orfs_fpath]: if not os.path.exists(fp): return False - elif not os.stat(fp).st_size > 0: + elif not os.path.getsize(fp) > 0: return False return True @property @lru_cache(maxsize=None) def nucls(self): + """Retrieve `assembly` nucleotide ORFs. + + Returns + ------- + list + [SeqRecord, SeqRecord, ...] + + """ return self.orfs(orf_type='nucl') @property @lru_cache(maxsize=None) def prots(self): + """Retrieve `assembly` amino-acid ORFs. 
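Editor's note: patches 3 and 4 standardize the "exists and is non-empty" test on `os.path.getsize` (equivalent to `os.stat(fp).st_size`, just more direct), with COMBAK comments marking where a checksum comparison will eventually replace it. A sketch of that guard factored into a helper:

```python
import os

def prepared(fpath):
    """File exists and is non-empty. A checksum comparison is the noted
    follow-up (COMBAK) once checkpointing lands in utilities.py."""
    # os.path.getsize raises OSError on missing paths, so check existence first.
    return os.path.exists(fpath) and os.path.getsize(fpath) > 0

def orfs_called(prot_fpath, nucl_fpath):
    return all(prepared(fp) for fp in (prot_fpath, nucl_fpath))
```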
+ + Returns + ------- + list + [SeqRecord, SeqRecord, ...] + + """ return self.orfs(orf_type='prot') @property def taxonomy_assigned(self): + """Retrieve whether taxonomy has been assigned to `assembly`. This will + check whether `taxonomy_fpath` exists and is non-empty. + + Note: In the future, a checksum comparison should be performed to ensure + file integrity. + + Returns + ------- + type + Description of returned object. + + Raises + ------- + ExceptionName + Why the exception is raised. + + """ # COMBAK: Add checkpointing checksum check here - if os.path.exists(self.taxonomy_fpath) and os.stat(self.taxonomy_fpath).st_size > 0: + if os.path.exists(self.taxonomy_fpath) and os.path.getsize(self.taxonomy_fpath) > 0: return True return False @@ -228,29 +321,44 @@ def fragmentation_metric(self, quality_measure=.50): if sum(lengths) > target_size: return length - def describe(self): - print(f""" -Metagenome Details -________________________ -Assembly: {self.assembly} -Num. Sequences: {self.nseqs:,} -Size: {self.size:,} bp -N50: {self.fragmentation_metric():,} bp -N10: {self.fragmentation_metric(.1):,} bp -N90: {self.fragmentation_metric(.9):,} bp -Length Weighted Avg. GC content: {self.length_weighted_gc:4.2f}% -Largest sequence: {self.largest_seq} -________________________ -Autometa Details -________________________ -Outdir: {self.outdir} -ORFs called: {self.orfs_called} -Prots filepath: {self.prot_orfs_fpath} -Nucl filepath: {self.nucl_orfs_fpath} -Taxonomy method: {self.taxon_method} -Taxonomy assigned: {self.taxonomy_assigned} -Taxonomy filepath: {self.taxonomy_fpath} -""") + def describe(self, autometa_details=True): + """Print `assembly` details. + + Parameters + ---------- + autometa_details : bool + Also log Autometa specific information to the terminal (Default is True). + + Returns + ------- + NoneType + + """ + print('Metagenome Details\n' + '________________________\n' + f'Assembly: {self.assembly}\n' + f'Num. Sequences: {self.nseqs:,}\n' + f'Size: {self.size:,} bp\n' + f'N50: {self.fragmentation_metric():,} bp\n' + f'N10: {self.fragmentation_metric(.1):,} bp\n' + f'N90: {self.fragmentation_metric(.9):,} bp\n' + f'Length Weighted Avg. GC content: {self.length_weighted_gc:4.2f}%\n' + f'Largest sequence: {self.largest_seq}\n' + '________________________\n' + ) + if not autometa_details: + return + print( + 'Autometa Details\n' + '________________________\n' + f'Outdir: {self.outdir}\n' + f'ORFs called: {self.orfs_called}\n' + f'Prots filepath: {self.prot_orfs_fpath}\n' + f'Nucl filepath: {self.nucl_orfs_fpath}\n' + f'Taxonomy method: {self.taxon_method}\n' + f'Taxonomy assigned: {self.taxonomy_assigned}\n' + f'Taxonomy filepath: {self.taxonomy_fpath}\n' + ) @timeit def length_filter(self, out, cutoff=3000): @@ -279,6 +387,7 @@ def length_filter(self, out, cutoff=3000): cutoff value must be a positive real number FileExistsError filepath consisting of sequences that passed filter already exists + """ if not isinstance(cutoff, numbers.Number) or isinstance(cutoff, bool): # https://stackoverflow.com/a/4187220/13118765 @@ -335,6 +444,7 @@ def call_orfs(self, force=False, cpus=0, parallel=True): `force`,`parallel` or `cpus` type was incorrectly supplied. OSError ORF calling failed. + """ for arg, argname in zip([force, parallel],['force','parallel']): if not isinstance(arg, bool) and isinstance(arg, numbers.Number): @@ -376,6 +486,7 @@ def orfs(self, orf_type='prot', cpus=0): ------- ValueError Invalid `orf_type`. 
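Editor's note: only the tail of `fragmentation_metric` is visible in the hunks above (`if sum(lengths) > target_size: return length`). A standard Nx computation consistent with that tail, offered as a hedged reconstruction rather than the repository's exact body: walk contigs from longest to shortest and report the length at which `quality_measure` of the total assembly size is reached (N50 by default):

```python
def fragmentation_metric(seq_lengths, quality_measure=0.50):
    """Nx statistic sketch (N50 by default), assuming the conventional
    longest-first accumulation behind the visible `sum(lengths)` tail."""
    target_size = sum(seq_lengths) * quality_measure
    lengths = []
    for length in sorted(seq_lengths, reverse=True):
        lengths.append(length)
        if sum(lengths) > target_size:
            return length

print(fragmentation_metric([10, 20, 30, 40]))  # toy assembly N50 -> 30
```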
Choices=['prot','nucl'] + """ if not self.orfs_called: self.call_orfs(cpus=cpus) @@ -408,6 +519,7 @@ def get_kmers(self, kmer_size=5, multiprocess=True, out=None, normalized=None, TODO: get_kmers should handle both files and SeqRecords... NOTE: above TODO should be handled in kmers.py not here... + """ out_specified = out is not None out_exists = os.path.exists(out) if out else False @@ -617,7 +729,7 @@ def main(): level=logger.DEBUG) parser = argparse.ArgumentParser(description=""" - This handles filtering by length and taxon as well as ORF calling, + This script handles filtering by length and taxon as well as ORF calling, and various metagenome statistics. """, formatter_class=argparse.ArgumentDefaultsHelpFormatter) diff --git a/docs/template.py b/docs/template.py index 4db513121..0d6803692 100644 --- a/docs/template.py +++ b/docs/template.py @@ -86,7 +86,12 @@ def main(): format='[%(asctime)s %(levelname)s] %(name)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logger.DEBUG) - parser = argparse.ArgumentParser(description='Concise description of script.') + # Note: If you do not have any defaults corresponding to your parameters, + # you may remove the formatter class: ArgumentDefaultsHelpFormatter + # to reduce help text verbosity. + parser = argparse.ArgumentParser( + description='Concise description of script.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('positional',help='') parser.add_argument('--optional',help='') parser.add_argument( From 3033b0d1f526ddb4c07919fccc7545fdc711f905 Mon Sep 17 00:00:00 2001 From: EvanRees Date: Wed, 13 May 2020 16:33:40 -0500 Subject: [PATCH 4/4] :memo: Update describe property GC statement. * :art: Change os.stat(fpath).st_size to os.path.getsize(fpath) --- autometa/common/metabin.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/autometa/common/metabin.py b/autometa/common/metabin.py index dc8f291bf..c8e70de05 100644 --- a/autometa/common/metabin.py +++ b/autometa/common/metabin.py @@ -265,7 +265,7 @@ def prepared(self, fpath): """ # COMBAK: Add checkpointing checksum check here - if os.path.exists(fpath) and os.stat(fpath).st_size > 0: + if os.path.exists(fpath) and os.path.getsize(fpath) > 0: return True return False @@ -325,18 +325,17 @@ def describe(self, autometa_details=True): If True, will output autometa related information (the default is True). """ - print('Metabin Details\n' '________________________\n' f'Num. Contigs: {self.nseqs:,} / {self.nallseqs:,} ({self.seqs_pct:4.2f}% of total metagenome)\n' f'Num. Nucl. ORFs: {self.nnucls:,}\n' f'Num. Prot. ORFs: {self.nprots:,}\n' f'Size: {self.size:,}bp / {self.totalsize:,}bp ({self.size_pct:4.2f}% of total metagenome)\n' - f'Mean GC content: {self.length_weighted_gc:4.2f}%\n' - '________________________\n' - ) - if autometa_details: - print( + f'Length Weighted Avg. GC content: {self.length_weighted_gc:4.2f}%\n' + '________________________\n') + if not autometa_details: + return + print( 'Autometa Details\n' '________________________\n' f'Metagenome: {self.assembly}\n'
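Editor's note: the template.py hunk above adopts `argparse.ArgumentDefaultsHelpFormatter`, which appends `(default: ...)` to each option's help text automatically; the added comment rightly suggests dropping it when a script has no defaults, since it only annotates options that both define a default and supply help text. A tiny sketch of the effect:

```python
import argparse

parser = argparse.ArgumentParser(
    description="Concise description of script.",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--cutoff", type=int, default=3000,
                    help="Cutoff to apply to length filter")
# --help now renders:
#   --cutoff CUTOFF  Cutoff to apply to length filter (default: 3000)
# so defaults never need repeating inside the help strings themselves.
```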