From bee13cc205e769e7931339eb4ba57db0bfc2a4f9 Mon Sep 17 00:00:00 2001 From: EvanRees Date: Tue, 5 May 2020 11:21:13 -0500 Subject: [PATCH 1/4] Add entrypoint functionality. Update docstrings. * resolves issue-#54 * :memo: Update docstrings missing for functions. * :racehorse: Add cache for properties reading assembly sequences. * :art: Add Entrypoint functionality for incorporation to packaging/distribution. * :art::bug: Update main() to handle updated functions. --- autometa/common/mag.py | 57 ++++--- autometa/common/metagenome.py | 294 +++++++++++++++++++++++----------- autometa/taxonomy/ncbi.py | 2 +- 3 files changed, 235 insertions(+), 118 deletions(-) diff --git a/autometa/common/mag.py b/autometa/common/mag.py index 102266e17..194294eb7 100644 --- a/autometa/common/mag.py +++ b/autometa/common/mag.py @@ -32,7 +32,9 @@ import numpy as np from Bio import SeqIO +from Bio.SeqIO.FastaIO import SimpleFastaParser from Bio import SeqUtils +from functools import lru_cache from autometa.common.markers import Markers,MARKERS_DIR from autometa.common import kmers @@ -63,8 +65,29 @@ def __init__(self, assembly, contigs, outdir=None): self.nseqs = len(self.contigs) @property + @lru_cache(maxsize=None) + def assembly_seqs(self): + with open(self.assembly) as fh: + return [seq for title,seq in SimpleFastaParser(fh)] + + @property + @lru_cache(maxsize=None) + def mag_seqs(self): + """Retrieve sequences from assembly contained in `self.contigs`. + + Returns + ------- + list + list of SeqIO [SeqRecords, ...] + + """ + return [seq for seq in SeqIO.parse(self.assembly, 'fasta') + if seq.id in self.contigs] + + @property + @lru_cache(maxsize=None) def totalsize(self): - return sum([len(rec) for rec in self.get_seqs(all=True)]) + return sum(len(rec) for rec in self.assembly_seqs) @property def size_pct(self): @@ -72,19 +95,23 @@ def size_pct(self): @property def nallseqs(self): - return len(self.get_seqs(all=True)) + return len(self.assembly_seqs) @property def seqs_pct(self): return self.nseqs / self.nallseqs * 100 @property + @lru_cache(maxsize=None) def size(self): - return sum(len(seq) for seq in self.get_seqs()) + return sum(len(seq) for seq in self.mag_seqs) @property - def gc_content(self): - return np.mean([SeqUtils.GC(rec.seq) for rec in self.get_seqs()]) + @lru_cache(maxsize=None) + def length_weighted_gc(self): + weights = [len(rec.seq)/self.size for rec in self.mag_seqs] + gc_content = [SeqUtils.GC(rec.seq) for rec in self.mag_seqs] + return np.average(a=gc_content, weights=weights) @property def nucl_orfs_exist(self): @@ -143,26 +170,6 @@ def write_orfs(self, fpath, orf_type='prot'): records = self.get_orfs(orf_type=orf_type) SeqIO.write(records, fpath, 'fasta') - def get_seqs(self, all=False): - """Retrieve sequences from assembly. - - Parameters - ---------- - all : bool - Gets all sequences from assembly if True else sequences for MAG - (the default is False). - - Returns - ------- - list - list of SeqIO [SeqRecords, ...] - - """ - if all: - return [seq for seq in SeqIO.parse(self.assembly, 'fasta')] - return [seq for seq in SeqIO.parse(self.assembly, 'fasta') - if seq.id in self.contigs] - def describe(self, autometa_details=True): print(f''' M.A.G. 
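Editor's note: the `length_weighted_gc` property above computes `np.average(gc_content, weights=weights)` where the weights are each contig's share of total length. Because those weights sum to 1, this is the same as total GC bases over total bases. A minimal sketch of that arithmetic in plain Python, using hypothetical sequence strings in place of the parsed records (one caveat worth flagging: stacking `@lru_cache` under `@property` works, but on Python >= 3.8 `functools.cached_property` is the more idiomatic spelling and avoids `lru_cache` pinning `self` in the cache):

```python
# Minimal sketch of the length-weighted GC calculation introduced above,
# using plain Python instead of Bio.SeqUtils/numpy. The sequence strings
# are hypothetical stand-ins for records parsed from the assembly.
seqs = ["ATGCGC", "ATATATATAT", "GGCC"]

total_bp = sum(len(s) for s in seqs)
# Per-contig GC fraction, weighted by each contig's share of total length.
weighted_gc = sum(
    (s.upper().count("G") + s.upper().count("C")) / len(s) * (len(s) / total_bp)
    for s in seqs
) * 100
print(f"{weighted_gc:4.2f}%")  # 40.00% for this toy input
```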
Details diff --git a/autometa/common/metagenome.py b/autometa/common/metagenome.py index 4810825b0..55cafa0be 100644 --- a/autometa/common/metagenome.py +++ b/autometa/common/metagenome.py @@ -25,7 +25,9 @@ """ +import decimal import logging +import numbers import os import numpy as np @@ -34,6 +36,7 @@ from Bio import SeqIO from Bio.SeqIO.FastaIO import SimpleFastaParser from Bio import SeqUtils +from functools import lru_cache from autometa.common import kmers from autometa.common import coverage @@ -91,10 +94,10 @@ class Metagenome: [SeqRecord,...] nseqs : int Number of sequences in assembly. - mean_gc : float - Mean GC% of assembly. + length_weighted_gc : float + Length weighted average GC% of assembly. size : int - Total assembly size. + Total assembly size in bp. largest_seq : str id of longest sequence in assembly nucls : list @@ -136,11 +139,13 @@ def __str__(self): return self.assembly @property + @lru_cache(maxsize=None) def sequences(self): with open(self.assembly) as fh: return [seq for title,seq in SimpleFastaParser(fh)] @property + @lru_cache(maxsize=None) def seqrecords(self): return [seq for seq in SeqIO.parse(self.assembly, 'fasta')] @@ -149,8 +154,11 @@ def nseqs(self): return len(self.sequences) @property - def mean_gc(self): - return np.mean([SeqUtils.GC(seq) for seq in self.sequences]) + @lru_cache(maxsize=None) + def length_weighted_gc(self): + weights = [len(seq)/self.size for seq in self.sequences] + gc_counts = [SeqUtils.GC(seq) for seq in self.sequences] + return np.average(a=gc_counts, weights=weights) @property def size(self): @@ -168,6 +176,7 @@ def largest_seq(self): @property def orfs_called(self): + # COMBAK: Add checkpointing checksum check here for fp in [self.prot_orfs_fpath, self.nucl_orfs_fpath]: if not os.path.exists(fp): return False @@ -176,15 +185,18 @@ def orfs_called(self): return True @property + @lru_cache(maxsize=None) def nucls(self): return self.orfs(orf_type='nucl') @property + @lru_cache(maxsize=None) def prots(self): return self.orfs(orf_type='prot') @property def taxonomy_assigned(self): + # COMBAK: Add checkpointing checksum check here if os.path.exists(self.taxonomy_fpath) and os.stat(self.taxonomy_fpath).st_size > 0: return True return False @@ -226,7 +238,7 @@ def describe(self): N50: {self.fragmentation_metric():,} bp N10: {self.fragmentation_metric(.1):,} bp N90: {self.fragmentation_metric(.9):,} bp -Mean GC: {self.mean_gc:4.2f}% +Length Weighted Avg. GC content: {self.length_weighted_gc:4.2f}% Largest sequence: {self.largest_seq} ________________________ Autometa Details @@ -244,6 +256,11 @@ def describe(self): def length_filter(self, out, cutoff=3000): """Filters sequences by length with provided cutoff. + Note: A WARNING will be emitted if the length filter is applied *after* + the ORFs provided for the Metagenome are already called prompting the + user to perform orf calling again to correspond to length filtered + contigs. 
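Editor's note: the hunk above caches two parallel views of the assembly, `sequences` (plain strings via `SimpleFastaParser`) and `seqrecords` (full `SeqRecord` objects via `SeqIO.parse`). A sketch of why both exist, assuming a hypothetical FASTA path; the string parser is the lighter choice when only composition statistics are needed, while `SeqRecord`s carry the ids needed for filtering and rewriting:

```python
from Bio import SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser

assembly = "assembly.fna"  # hypothetical path

# Lightweight view: (title, sequence-string) pairs; enough for GC/size stats.
with open(assembly) as fh:
    seqs = [seq for _title, seq in SimpleFastaParser(fh)]

# Full view: SeqRecord objects, needed when ids must be kept
# (e.g. length_filter writes surviving records back out with SeqIO.write).
records = list(SeqIO.parse(assembly, "fasta"))

assert len(seqs) == len(records)
```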
+ Parameters ---------- cutoff : int @@ -263,11 +280,8 @@ def length_filter(self, out, cutoff=3000): FileExistsError filepath consisting of sequences that passed filter already exists """ - try: - cutoff = float(cutoff) - except Exception as err: - pass - if not type(cutoff) in [int, float]: + if not isinstance(cutoff, numbers.Number) or isinstance(cutoff, bool): + # https://stackoverflow.com/a/4187220/13118765 raise TypeError(f'cutoff: {cutoff} must be a float or int') if cutoff <= 0: raise ValueError(f'cutoff: {cutoff} must be a positive real number') @@ -280,7 +294,12 @@ def length_filter(self, out, cutoff=3000): if not os.path.exists(gunzipped_fpath): gunzip(self.assembly, gunzipped_fpath) self.assembly = gunzipped_fpath + logger.info(f'Getting contigs greater than or equal to {cutoff:,} bp') records = [seq for seq in self.seqrecords if len(seq) >= cutoff] + if self.orfs_called: + msg = (f'{self.nucl_orfs_fpath} and {self.prot_orfs_fpath} have already been called!' + 'Call orfs again to retrieve only ORFs corresponding to filtered assembly') + logger.warning(msg) SeqIO.write(records, out, 'fasta') return Metagenome( assembly=out, @@ -317,13 +336,13 @@ def call_orfs(self, force=False, cpus=0, parallel=True): OSError ORF calling failed. """ - if type(force) is not bool: - raise TypeError(f'force:({force}) must be a boolean. I.e. True|False') - if type(parallel) is not bool: - raise TypeError(f'parallel:({parallel}) must be a boolean. I.e. True|False') - if type(cpus) is not int: - raise TypeError(f'cpus:({cpus}) must be an integer') - # OPTIMIZE: Should not need to call ORFs on contigs below length cutoff + for arg, argname in zip([force, parallel],['force','parallel']): + if not isinstance(arg, bool) and isinstance(arg, numbers.Number): + raise TypeError(f'{argname} must be a boolean!') + if not isinstance(cpus, int) or isinstance(cpus, bool): + raise TypeError(f'cpus:({cpus}) must be an integer!') + + # COMBAK: Add checkpointing checksum check here try: nucls_fp, prots_fp = prodigal.run( assembly=self.assembly, @@ -361,7 +380,7 @@ def orfs(self, orf_type='prot', cpus=0): if not self.orfs_called: self.call_orfs(cpus=cpus) if orf_type not in {'prot','nucl'}: - raise ValueError('orf_type must be \'prot\' or \'nucl\'!') + raise ValueError('orf_type must be "prot" or "nucl"!') orfs_fpath = self.prot_orfs_fpath if orf_type == 'prot' else self.nucl_orfs_fpath return [orf for orf in SeqIO.parse(orfs_fpath, 'fasta')] @@ -372,14 +391,14 @@ def get_kmers(self, kmer_size=5, multiprocess=True, out=None, normalized=None, Parameters ---------- - kmer_size : int + kmer_size : int, optional length of k-mer to count (the default is 5). - normalized : str - Perform Centered-log ratio normalization on counted k-mers (the default is None). - and write to provided `normalized` path. - out : str + normalized : str, optional + Perform Centered-log ratio normalization on counted k-mers and write + to provided `normalized` file path (the default is None). + out : str, optional Write counted k-mers to `out` (the default is None). - force : bool + force : bool, optional Overwrite existing k-mers `out` file (the default is False). Returns @@ -396,7 +415,7 @@ def get_kmers(self, kmer_size=5, multiprocess=True, out=None, normalized=None, case2 = out_specified and out_exists and force case3 = out_specified and not out_exists if case1: - logger.warning(f'FileExistsError: {out} force to overwrite. [retrieving]') + logger.warning(f'FileExists: {out} force to overwrite. 
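Editor's note: the rewritten cutoff validation above swaps the old try/float coercion for `numbers.Number` with an explicit `bool` exclusion. The reason is that `bool` is a subclass of `int` in Python, so `True` would otherwise pass as a valid length cutoff. A standalone sketch of the idiom:

```python
import numbers

def validate_cutoff(cutoff):
    # bool is a subclass of int, so isinstance(True, numbers.Number) is True;
    # the extra check keeps True/False from slipping through as "3000-like".
    if not isinstance(cutoff, numbers.Number) or isinstance(cutoff, bool):
        raise TypeError(f"cutoff: {cutoff} must be a float or int")
    if cutoff <= 0:
        raise ValueError(f"cutoff: {cutoff} must be a positive real number")
    return cutoff

validate_cutoff(3000)      # ok
# validate_cutoff(True)    # TypeError
# validate_cutoff(-5)      # ValueError
```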
[retrieving]') return pd.read_csv(out, sep='\t', index_col='contig') normalize_kmers = True if normalized else False logger.info(f'Counting {kmer_size}-mers. Normalize: {normalize_kmers}') @@ -412,10 +431,32 @@ def get_kmers(self, kmer_size=5, multiprocess=True, out=None, normalized=None, normalized_df = kmers.normalize(kmers_df) normalized_df.to_csv(normalized, sep='\t', header=True, index=True) return normalized_df - return kmers_df + else: + return kmers_df @timeit def get_coverages(self, out, from_spades=True, **kwargs): + """Retrieve contig coverages using provided `assembly` and `*_reads` or + if the metagenome assembly was generated from SPAdes, use the k-mer coverages + provided in each contig's header. + + Parameters + ---------- + out : str + + from_spades : bool + Description of parameter `from_spades` (the default is True). + **kwargs : dict + May contain the following keys: 'sam', 'bam', 'lengths', 'bed' + Keys should correspond to their respective alignment files. + 'lengths' is a path to a tab-delimited table of contig and its length. + + Returns + ------- + pd.DataFrame + index=contig cols=['coverage'] + + """ if from_spades: return coverage.from_spades_names(self.seqrecords) return coverage.get( @@ -438,15 +479,23 @@ def assign_taxonomy(self, method, force=False, *args, **kwargs): ---------- force : bool, optional overwrite existing voting method's file (the default is False). - *args : type + *args : list Description of parameter `*args`. - **kwargs : type - Description of parameter `**kwargs`. + **kwargs : dict + May contain the following keys: + + * cpus : int, num. cpus to use + * ncbi : str, + * usepickle : bool, whether to pickle taxonomy-specific files + * verbose : bool, Add verbosity to stream to terminal + * blast : str, + * hits : str, + * force : bool, force overwrite existing results files Raises ------- - ExceptionName - Why the exception is raised. + NotImplementedError + Provided `method` not yet implemented. """ logger.debug(f'assigning taxonomy via {method}') @@ -477,9 +526,17 @@ def get_kingdoms(self, **kwargs): Parameters ---------- - **kwargs : dict - Optional additional keyword arguments to supply to assign_taxonomy - and supply a separate NCBI_DIR + **kwargs : dict, optional + May contain the following keys: + + * cpus : int, num. cpus to use + * ncbi : str, + * usepickle : bool, whether to pickle taxonomy-specific files + * verbose : bool, Add verbosity to stream to terminal + * blast : str, + * hits : str, + * force : bool, force overwrite existing results files + Returns ------- @@ -506,6 +563,7 @@ def get_kingdoms(self, **kwargs): left_on='taxid', right_index=True) self.taxonomy.to_csv(self.taxonomy_fpath,sep='\t',index=True,header=True) + # COMBAK: Add checkpointing checksum check here logger.debug(f'Added canonical rank names to {self.taxonomy_fpath}') if 'superkingdom' not in self.taxonomy.columns: raise KeyError(f'superkingdom is not in taxonomy columns {self.taxonomy.columns}') @@ -526,7 +584,7 @@ def write_ranks(self, rank='superkingdom'): Returns ------- list - [rank_name_fp, ...] + [rank_name_fpath, ...] 
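Editor's note: `get_kmers` above reduces its file caching to three cases over `out` and `force`. A hedged sketch of the same decision table in isolation, where `compute` is a hypothetical callable standing in for `kmers.count` + normalization:

```python
import os
import pandas as pd

def cached_table(compute, out=None, force=False):
    """Sketch of get_kmers' case logic: reuse `out` unless `force` is set.
    `compute` is a hypothetical zero-arg callable returning a DataFrame."""
    out_specified = out is not None
    out_exists = os.path.exists(out) if out else False
    if out_specified and out_exists and not force:
        # case 1: result already on disk and no overwrite requested
        return pd.read_csv(out, sep="\t", index_col="contig")
    df = compute()  # cases 2 & 3 (or no `out` at all): recompute
    if out_specified:
        df.to_csv(out, sep="\t", header=True, index=True)
    return df
```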
Raises ------- @@ -537,88 +595,140 @@ def write_ranks(self, rank='superkingdom'): if rank not in NCBI.CANONICAL_RANKS: raise ValueError(f'rank: {rank} not in {NCBI.CANONICAL_RANKS}') fpaths = [] - for rank_name,dff in self.taxonomy.groupby(rank): - records = [r for r in SeqIO.parse(self.assembly, 'fasta') if r.id in dff.index] + for rank_name, dff in self.taxonomy.groupby(rank): + records = [r for r in self.seqrecords if r.id in dff.index] rank_name = rank_name.replace(' ','_') rank_name_fname = '.'.join([rank_name.title(),'fna']) - rank_name_fp = os.path.join(self.outdir, rank_name_fname) + rank_name_fpath = os.path.join(self.outdir, rank_name_fname) if not records: - logger.warning(f'No records to write to {rank_name_fp}') + logger.warning(f'No records to write to {rank_name_fpath}') else: - n_written = SeqIO.write(records, rank_name_fp, 'fasta') - logger.debug(f'Wrote {n_written} records to {rank_name_fp}') - fpaths.append(rank_name_fp) + n_written = SeqIO.write(records, rank_name_fpath, 'fasta') + logger.debug(f'Wrote {n_written} records to {rank_name_fpath}') + fpaths.append(rank_name_fpath) return fpaths def main(): import argparse import logging as logger logger.basicConfig( - format='%(asctime)s : %(name)s : %(levelname)s : %(message)s', + format='[%(asctime)s %(levelname)s] %(name)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logger.DEBUG) + parser = argparse.ArgumentParser(description=""" - Metagenome class holding attributes and methods to manipulate metagenome assemblies. - """) - parser.add_argument('assembly', help='') - parser.add_argument('--ncbi', help='', required=True) - parser.add_argument('--cutoff', help='length to filter sequences',default=3000, - type=int) - parser.add_argument('--kmer-size', default=5, type=int) - parser.add_argument( - '--kmer-normalized', - help='Perform CLR transform on k-mer frequencies if provided. ()', - ) - parser.add_argument( - '--kmer-fpath', - help='', - ) + This handles filtering by length and taxon as well as ORF calling, + and various metagenome statistics. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('assembly', help='Path to metagenome assembly (nucleotide fasta).') + parser.add_argument('outdir', help='Output directory to store annotations.') + parser.add_argument('nucls', + help='Path to nucleotide ORFs corresponding to `assembly.` ' + '(Will write to path if ORFs do not exist).', + type=str) + parser.add_argument('prots', + help='Path to amino acid ORFs corresponding to `assembly.` ' + '(Will write to path if ORFs do not exist).', + type=str) + parser.add_argument('--filter', + help='Filter to apply to metagenome. 
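Editor's note: `write_ranks` above groups the taxonomy table by a canonical rank and writes one FASTA per rank name. A minimal pandas/Biopython sketch of that pattern with hypothetical inputs, mirroring the hunk's groupby-then-write loop:

```python
import os
import pandas as pd
from Bio import SeqIO

def write_rank_fastas(taxonomy, seqrecords, outdir, rank="superkingdom"):
    """taxonomy: DataFrame indexed by contig with a `rank` column;
    seqrecords: list of SeqRecord parsed from the assembly (hypothetical)."""
    fpaths = []
    for rank_name, dff in taxonomy.groupby(rank):
        records = [r for r in seqrecords if r.id in dff.index]
        fname = f"{rank_name.replace(' ', '_').title()}.fna"
        fpath = os.path.join(outdir, fname)
        if records:
            SeqIO.write(records, fpath, "fasta")
            fpaths.append(fpath)
    return fpaths
```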
You may supply multiple filters.', + choices=['length','taxon'], nargs='*') + parser.add_argument('--cutoff', help='Cutoff to apply to length filter', + default=3000, + type=int, + metavar='') + parser.add_argument('--length-fname', + help='Filename to assign fasta after applying length filter.', + default='`assembly`.filtered.fna') parser.add_argument('--taxon-method', default='majority_vote', choices=['majority_vote']) - parser.add_argument('--vote-fname', help='', + parser.add_argument('--taxon-fname', + help='Filename to assign voted taxonomy annotation.', default='taxonomy_vote.tsv') + parser.add_argument('--ncbi', + help='Path to NCBI databases directory.', + default=NCBI_DIR) + parser.add_argument('--pickle', help='Whether to serialize taxonomy-specific files', + action='store_true', default=False) + parser.add_argument('--blast', + help='Path to diamond blast results.tsv. (outfmt=6) ' + '(Will write to path if it does not exist).', + default=None) + parser.add_argument('--hits', + help='Path to diamond blast hits.pkl.gz. ' + '(Will write to path if it does not exist).', + default=None) # Eventually will need to create a subparser for the taxon assignment methods # to include help information and required parameters according to method. - parser.add_argument('--cpus', help='num cpus to use', default=0) - parser.add_argument('--noparallel', help="Do not use GNU parallel", - action='store_false', default=True) - parser.add_argument('--force', help="overwrite existing files", + parser.add_argument('--cpus', help='Number of cpus to use when performing ' + 'annotations (Default will use all possible if `--parallel` is supplied).', + default=0, type=int) + parser.add_argument('--parallel', help='Use GNU parallel when performing annotations.', action='store_true', default=False) - parser.add_argument('--verbose', help="add verbosity", + parser.add_argument('--force', help="Overwrite existing files", action='store_true', default=False) + parser.add_argument('--verbose', help="Log more information to terminal.", + action='store_true', default=False) + parser.add_argument('--stats', help="Print various metagenome assembly statistics.", + action='store_true', default=False) + args = parser.parse_args() - raw_mags = Metagenome(args.assembly) - mags = raw_mags.length_filter(cutoff=args.cutoff) - logger.info(f'{args.cutoff:,}bp length filter {raw_mags.nseqs:,} to {mags.nseqs:,} seqs') - logger.info(f'CallingORFs: force:{args.force}. cpus:{args.cpus} parallel:{args.noparallel}') + + taxonomy_fpath = os.path.join(args.outdir, args.taxon_fname) + raw_mg = Metagenome( + assembly=args.assembly, + outdir=args.outdir, + prot_orfs_fpath=args.prots, + nucl_orfs_fpath=args.nucls, + taxonomy_fpath=taxonomy_fpath, + taxon_method=args.taxon_method) + + if args.filter and 'length' in args.filter: + if args.length_fname != '`assembly`.filtered.fna': + length_filtered_fpath = os.path.join(raw_mg.outdir, args.length_fname) + else: + basename = os.path.basename(raw_mg.assembly) + fname = os.path.splitext(basename)[0] + length_fname = f'{fname}.filtered.fna' + length_filtered_fpath = os.path.join(raw_mg.outdir, length_fname) + try: + mg = raw_mg.length_filter(out=length_filtered_fpath, cutoff=args.cutoff) + except FileExistsError as err: + logger.warning(f'FileExistsError: {length_filtered_fpath}. 
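Editor's note: the new `--filter` flag above combines `choices` with `nargs='*'`, so callers can stack filters (`--filter length taxon`) while argparse still validates each value. A small sketch of how that resolves, with a hypothetical prog name:

```python
import argparse

parser = argparse.ArgumentParser(prog="metagenome.py")  # hypothetical prog
parser.add_argument("--filter", choices=["length", "taxon"], nargs="*",
                    help="Filter(s) to apply; each value is checked "
                         "against choices independently.")
args = parser.parse_args(["--filter", "length", "taxon"])
assert args.filter == ["length", "taxon"]
# When --filter is omitted entirely, args.filter is None, hence the
# `if args.filter and 'length' in args.filter` guards later in main().
```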
Skipping...') + mg = Metagenome( + assembly=length_filtered_fpath, + outdir=raw_mg.outdir, + nucl_orfs_fpath=raw_mg.nucl_orfs_fpath, + prot_orfs_fpath=raw_mg.prot_orfs_fpath, + taxonomy_fpath=raw_mg.taxonomy_fpath, + taxon_method=raw_mg.taxon_method) + else: + mg = raw_mg + + if args.stats: + mg.describe() + try: - mags.call_orfs( + mg.call_orfs( force=args.force, - verbose=args.verbose, cpus=args.cpus, - parallel=args.noparallel, + parallel=args.parallel, ) except FileExistsError as err: - logger.warning(f'FileExistsError: {mags.prots_out}. Skipping...') - logger.info(f'TaxonAssignment - method:{args.taxon_method}') - mags.get_kingdoms( - method=args.taxon_method, - fasta=mags.prots_out, - ncbi_dir=args.ncbi, - outdir=mags.dirname, - blast=None, - usepickle=True, - verbose=args.verbose, - ) - logger.info(f'Getting k-mers of size {args.kmer_size}. Normalize: {args.kmer_normalized}') - kmer_fpath = args.kmer_fpath if args.kmer_fpath else os.path.join(mags.dirname, 'kmers.tsv') - kmers_df = mags.get_kmers( - kmer_size=args.kmer_size, - normalized=args.kmer_normalized, - out=kmer_fpath, - force=args.force, - ) - logger.info('Done') + logger.warning(f'FileExistsError: {mg.prots_out}. Skipping...') + + if args.filter and 'taxon' in args.filter: + logger.info(f'TaxonAssignment - method:{args.taxon_method}') + mg.get_kingdoms( + cpus=args.cpus, + ncbi=args.ncbi, + usepickle=args.pickle, + verbose=args.verbose, + blast=args.blast, + hits=args.hits, + force=args.force, + ) if __name__ == '__main__': main() diff --git a/autometa/taxonomy/ncbi.py b/autometa/taxonomy/ncbi.py index a9f577f32..8bb886d24 100644 --- a/autometa/taxonomy/ncbi.py +++ b/autometa/taxonomy/ncbi.py @@ -41,7 +41,7 @@ logger = logging.getLogger(__name__) -BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) +BASE_DIR = os.path.dirname(os.path.dirname(__file__)) NCBI_DIR = os.path.join(BASE_DIR,'databases','ncbi') class NCBI: From b06178bb5a8ce784fb9ba844b33afbe678ed30d0 Mon Sep 17 00:00:00 2001 From: EvanRees Date: Tue, 5 May 2020 17:22:43 -0500 Subject: [PATCH 2/4] Update MAG class to MetaBin. * :memo: Add docstrings to methods and class * :art::fire: Remove split_nucleotides method. * :art: Update get_binning function to handle coverages as filepath not pd.DataFrame * :art: Change mag.py to metabin.py * :racehorse: fixed from PR-#66. Add cache functionality to time-consuming property methods. * :memo: Add COMBAK comment for checkpointing. Return to this when implemented in utilities.py --- autometa/common/{mag.py => metabin.py} | 276 ++++++++++++++++++++----- autometa/common/metagenome.py | 6 +- autometa/config/user.py | 2 +- 3 files changed, 230 insertions(+), 54 deletions(-) rename autometa/common/{mag.py => metabin.py} (55%) diff --git a/autometa/common/mag.py b/autometa/common/metabin.py similarity index 55% rename from autometa/common/mag.py rename to autometa/common/metabin.py index 194294eb7..dc8f291bf 100644 --- a/autometa/common/mag.py +++ b/autometa/common/metabin.py @@ -46,17 +46,44 @@ logger = logging.getLogger(__name__) -class MAG: - """docstring for Autometa MAG class.""" +class MetaBin: + """Autometa MetaBin class to manipulate/annotate contigs from a metagenome. + + Parameters + ---------- + assembly : str + + contigs : list + List of contigs to manipulate/annotate (must be contained in metagenome). + outdir : str, optional + (Default is the directory storing the `assembly`). 
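Editor's note: `main()` above treats a pre-existing length-filtered FASTA as a cache hit; `length_filter` raises `FileExistsError`, and the handler rebuilds a `Metagenome` around the file already on disk. A reduced sketch of that recover-and-reuse flow, with the wrapper function itself being hypothetical:

```python
def filtered_or_cached(raw_mg, out, cutoff):
    """Hypothetical wrapper around the try/except in main(): filter the
    assembly, or fall back to the filtered file from a previous run."""
    try:
        return raw_mg.length_filter(out=out, cutoff=cutoff)
    except FileExistsError:
        # The filtered assembly already exists; reuse it rather than
        # overwriting (force semantics are handled elsewhere).
        return type(raw_mg)(
            assembly=out,
            outdir=raw_mg.outdir,
            nucl_orfs_fpath=raw_mg.nucl_orfs_fpath,
            prot_orfs_fpath=raw_mg.prot_orfs_fpath,
            taxonomy_fpath=raw_mg.taxonomy_fpath,
            taxon_method=raw_mg.taxon_method,
        )
```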
+ + Attributes + ---------- + basename : str + base name of `assembly` + root : str + root name of `assembly` (Will remove common extension like '.fasta') + nucls_fname : str + File name of `contigs` nucleotide ORFs + prots_fname : str + File name of `contigs` amino-acid ORFs + nucl_orfs_fpath : str + + prot_orfs_fpath : str + + nseqs : int + Number of contigs in MetaBin + + """ def __init__(self, assembly, contigs, outdir=None): self.assembly = os.path.realpath(assembly) - self.outdir = os.path.realpath(outdir) if outdir else os.path.dirname(self.assembly) self.basename = os.path.basename(self.assembly) - self.assembly_name = self.basename.split('.')[0] + self.root = os.path.splitext(self.basename)[0] + self.outdir = os.path.realpath(outdir) if outdir else os.path.dirname(self.assembly) nucls_ext = 'orfs.fna' prots_ext = 'orfs.faa' - self.root = os.path.splitext(self.basename)[0] self.nucls_fname = '.'.join([self.root, nucls_ext]) self.prots_fname = '.'.join([self.root, prots_ext]) self.nucl_orfs_fpath = os.path.join(self.outdir, self.nucls_fname) @@ -67,13 +94,21 @@ def __init__(self, assembly, contigs, outdir=None): @property @lru_cache(maxsize=None) def assembly_seqs(self): + """Retrieve all of the sequences from the provided `assembly`. + + Returns + ------- + list + list of str [seq, ...] + + """ with open(self.assembly) as fh: return [seq for title,seq in SimpleFastaParser(fh)] @property @lru_cache(maxsize=None) - def mag_seqs(self): - """Retrieve sequences from assembly contained in `self.contigs`. + def seqrecords(self): + """Retrieve seqrecords from assembly contained in `self.contigs`. Returns ------- @@ -87,59 +122,155 @@ def mag_seqs(self): @property @lru_cache(maxsize=None) def totalsize(self): + """Get the total assembly size in bp. + + Returns + ------- + int + Sum of sequence lengths in `assembly` + + """ return sum(len(rec) for rec in self.assembly_seqs) @property def size_pct(self): + """Get the size of the `contigs` in reference to the `assembly` size. + This is expressed as a percentage + + Returns + ------- + float + Size percentage of `contigs` covering the total `assembly` size. + + """ return self.size / self.totalsize * 100 @property def nallseqs(self): + """Get the total number of sequences in `assembly`. + + Returns + ------- + int + Number of sequences contained in `assembly` + + """ return len(self.assembly_seqs) @property def seqs_pct(self): + """Get the percentage of the number of `contigs` compared to the number + of sequences in the `assembly`. + + Returns + ------- + float + percentage of `contigs` corresponding to num. seqs. in `assembly`. + + """ return self.nseqs / self.nallseqs * 100 @property @lru_cache(maxsize=None) def size(self): - return sum(len(seq) for seq in self.mag_seqs) + """Get the summation of the lengths of `contigs` in bp. + + Returns + ------- + int + Summation of lengths of `contigs` in bp. + + """ + return sum(len(seq) for seq in self.seqrecords) @property @lru_cache(maxsize=None) def length_weighted_gc(self): - weights = [len(rec.seq)/self.size for rec in self.mag_seqs] - gc_content = [SeqUtils.GC(rec.seq) for rec in self.mag_seqs] + """Get the length weighted GC content of `contigs`. + + Returns + ------- + float + GC percentage weighted by contig length. + + """ + weights = [len(rec.seq)/self.size for rec in self.seqrecords] + gc_content = [SeqUtils.GC(rec.seq) for rec in self.seqrecords] return np.average(a=gc_content, weights=weights) @property def nucl_orfs_exist(self): + """Determine whether `nucl_orfs_fpath` exists. 
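Editor's note: `MetaBin.__init__` above derives both ORF paths from the assembly's root name via `os.path.splitext`, so `scaffolds.fasta` yields `scaffolds.orfs.fna` and `scaffolds.orfs.faa` under `outdir`. A sketch of just that path plumbing:

```python
import os

def orf_paths(assembly, outdir=None):
    assembly = os.path.realpath(assembly)
    outdir = os.path.realpath(outdir) if outdir else os.path.dirname(assembly)
    # splitext strips one trailing extension: 'scaffolds.fasta' -> 'scaffolds'
    # (a gzipped 'scaffolds.fasta.gz' would only lose the '.gz').
    root = os.path.splitext(os.path.basename(assembly))[0]
    nucl = os.path.join(outdir, f"{root}.orfs.fna")
    prot = os.path.join(outdir, f"{root}.orfs.faa")
    return nucl, prot

print(orf_paths("/data/scaffolds.fasta"))
# ('/data/scaffolds.orfs.fna', '/data/scaffolds.orfs.faa')
```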
+ + Returns + ------- + bool + True is returned if `nucl_orfs_fpath` is a valid path, otherwise False. + + """ return self.prepared(self.nucl_orfs_fpath) @property def prot_orfs_exist(self): + """Determine whether `prot_orfs_fpath` exists. + + Returns + ------- + bool + True is returned if `prot_orfs_fpath` is a valid path, otherwise False. + + """ return self.prepared(self.prot_orfs_fpath) @property def nnucls(self): + """Get the number of nucleotide ORFs in `nucl_orfs_fpath`. + + Returns + ------- + int + Number of nucleotide ORFs in `nucl_orfs_fpath`. + + """ if not self.nucl_orfs_exist: return np.nan return len(self.get_orfs('nucl')) @property def nprots(self): + """Get the number of amino-acid ORFs in `prot_orfs_fpath`. + + Returns + ------- + int + Number of amino-acid ORFs in `prot_orfs_fpath`. + + """ if not self.prot_orfs_exist: return np.nan return len(self.get_orfs('prot')) def prepared(self, fpath): + """Check whether `fpath` exists and is valid via checksum in checkpoints. + + Parameters + ---------- + fpath : str + + + Returns + ------- + bool + Whether provided `fpath` has been generated and is valid + + """ + # COMBAK: Add checkpointing checksum check here if os.path.exists(fpath) and os.stat(fpath).st_size > 0: return True return False def get_orfs(self, orf_type='prot'): - """Retrieve ORFs corresponding to MAG. + """Retrieve ORFs corresponding to MetaBin. Parameters ---------- @@ -154,12 +285,17 @@ def get_orfs(self, orf_type='prot'): Raises ------- + KeyError + `orf_type` not in ['prot','nucl'] FileNotFoundError - Why the exception is raised. + Either `self.prot_orfs_fpath` or `self.nucl_orfs_fpath` not found. + This will correspond to `orf_type` """ orfs_fpaths = {'prot':self.prot_orfs_fpath, 'nucl':self.nucl_orfs_fpath} - orfs_fpath = orfs_fpaths.get(orf_type, 'prot') + if orf_type not in orfs_fpaths: + raise KeyError(f'{orf_type} not in {orfs_fpaths}') + orfs_fpath = orfs_fpaths[orf_type] if not self.prepared(orfs_fpath): raise FileNotFoundError(orfs_fpath) return prodigal.orf_records_from_contigs( @@ -167,37 +303,55 @@ def get_orfs(self, orf_type='prot'): fpath=self.prot_orfs_fpath) def write_orfs(self, fpath, orf_type='prot'): + """Write `orf_type` ORFs from `contigs` to `fpath`. + + Parameters + ---------- + fpath : str + + * embedded : str, + * do_pca : bool, Perform PCA prior to embedding + * embedding_method : str, Embedding method to use choices=['sksne','bhsne','umap'] + * perplexity : float, perplexity setting for embedding method sksne/bhsne + * coverage : str, + * taxonomy : str, + * domain : str, kindom to bin choices=['bacteria','archaea'] + * purity : float, purity cutoff to apply to bins + * completeness : float, completeness cutoff to apply to bins + * reverse : bool, If true will bin taxa from least to most specific rank + Returns ------- pd.DataFrame @@ -213,8 +379,8 @@ def get_binning(self, method='recursive_dbscan', **kwargs): Raises ------- - ExceptionName - Why the exception is raised. + NotImplementedError + Provided `method` is not yet implemented. 
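Editor's note: the `get_orfs` fix above is worth spelling out. The old `orfs_fpaths.get(orf_type, 'prot')` silently returned the literal string `'prot'` (not a path) for any unrecognized `orf_type`, surfacing later as a confusing `FileNotFoundError('prot')`. The revision fails fast with a `KeyError` listing the valid choices. A standalone sketch with hypothetical paths:

```python
def lookup_orfs_fpath(orf_type, orfs_fpaths):
    # Fail fast on bad orf_type; the old `.get(orf_type, 'prot')` default
    # returned the string 'prot' itself, which is not a file path.
    if orf_type not in orfs_fpaths:
        raise KeyError(f"{orf_type} not in {orfs_fpaths}")
    return orfs_fpaths[orf_type]

fpaths = {"prot": "bin.orfs.faa", "nucl": "bin.orfs.fna"}  # hypothetical
lookup_orfs_fpath("prot", fpaths)       # -> 'bin.orfs.faa'
# lookup_orfs_fpath("protein", fpaths)  # KeyError listing valid choices
```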
""" if method == 'recursive_dbscan': @@ -224,22 +390,24 @@ def get_binning(self, method='recursive_dbscan', **kwargs): embedded=kwargs.get('embedded'), do_pca=kwargs.get('do_pca',True), pca_dimensions=kwargs.get('pca_dims',50), - method=kwargs.get('embedding_method','UMAP'), + method=kwargs.get('embedding_method','bhsne'), perplexity=kwargs.get('perplexity',30), ) except ValueError as error: logger.exception(error) master_df = embedded_df - coverage_df = kwargs.get('coverage') - if coverage_df is not None or not coverage_df.empty: + coverage_fp = kwargs.get('coverage') + if coverage_fp: + coverage_df = pd.read_csv(coverage_fp, sep='\t', index_col='contig') master_df = pd.merge( master_df, coverage_df, how='left', left_index=True, right_index=True) - if kwargs.get('taxonomy'): - taxa_df = pd.read_csv(kwargs.get('taxonomy'), sep='\t', index_col='contig') + taxonomy_fp = kwargs.get('taxonomy') + if taxonomy_fp: + taxa_df = pd.read_csv(taxonomy_fp, sep='\t', index_col='contig') master_df = pd.merge( left=master_df, right=taxa_df, @@ -283,11 +451,6 @@ def markers(self, kingdom='bacteria', dbdir=MARKERS_DIR, force=False): pd.DataFrame wide format - index_col='contig', columns=[PFAM,...] - Raises - ------- - ExceptionName - Why the exception is raised. - """ logger.debug(f'Retrieving markers for {kingdom} kingdom') @@ -298,6 +461,20 @@ def markers(self, kingdom='bacteria', dbdir=MARKERS_DIR, force=False): return markers.get() def subset_df(self, df): + """Retrieve subset of provided `df` containing only `contigs`. + + Parameters + ---------- + df : pd.DataFrame,pd.Series,str + `df` may be a pandas Series or DataFrame, or a path to file. + If a path is provided, `df` will be read as a tab-delimited table. + + Returns + ------- + pd.DataFrame + index='contig', cols=cols in provided `df`, subset by `contigs`. + + """ if type(df) not in [pd.DataFrame, pd.Series]: raise TypeError(f'Unable to subset df. {type(df)} is not Series or DataFrame') if type(df) is pd.DataFrame: @@ -313,15 +490,15 @@ def main(): format='%(asctime)s : %(name)s : %(levelname)s : %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logger.DEBUG) - parser = argparse.ArgumentParser(description='Autometa MAG Class') + parser = argparse.ArgumentParser(description='Autometa MetaBin Class') parser.add_argument('--assembly', help='', required=True) - parser.add_argument('--contigs', help='list of contigs in MAG',nargs='+', required=True) + parser.add_argument('--contigs', help='list of contigs in MetaBin',nargs='+', required=True) parser.add_argument('--domain', help='kingdom to use for binning', default='bacteria') parser.add_argument('--kmers', help=' Date: Wed, 13 May 2020 16:25:18 -0500 Subject: [PATCH 3/4] :memo: Add docstrings to class properties. 
* :memo::art: Add defaults formatter_class to template.py --- autometa/common/metagenome.py | 182 +++++++++++++++++++++++++++------- docs/template.py | 7 +- 2 files changed, 153 insertions(+), 36 deletions(-) diff --git a/autometa/common/metagenome.py b/autometa/common/metagenome.py index 55cafa0be..396cab5de 100644 --- a/autometa/common/metagenome.py +++ b/autometa/common/metagenome.py @@ -107,15 +107,16 @@ class Metagenome: Methods ---------- - - self.fragmentation_metric() - - self.describe() - - self.length_filter() - - self.call_orfs() - - self.orfs() - - self.get_kmers() - - self.assign_taxonomy() - - self.get_kingdoms() - - self.write_ranks() + * self.fragmentation_metric() + * self.describe() + * self.length_filter() + * self.call_orfs() + * self.orfs() + * self.get_kmers() + * self.assign_taxonomy() + * self.get_kingdoms() + * self.write_ranks() + """ def __init__(self, assembly, outdir, nucl_orfs_fpath, prot_orfs_fpath, taxonomy_fpath, taxon_method='majority_vote', fwd_reads=None, @@ -141,31 +142,79 @@ def __str__(self): @property @lru_cache(maxsize=None) def sequences(self): + """Retrieve the sequences from provided `assembly`. + + Returns + ------- + list + [seq, seq, ...] + + """ with open(self.assembly) as fh: return [seq for title,seq in SimpleFastaParser(fh)] @property @lru_cache(maxsize=None) def seqrecords(self): + """Retrieve SeqRecord objects from provided `assembly`. + + Returns + ------- + list + [SeqRecord, SeqRecord, ...] + + """ return [seq for seq in SeqIO.parse(self.assembly, 'fasta')] @property def nseqs(self): + """Retrieve the number of sequences in provided `assembly`. + + Returns + ------- + int + Number of sequences parsed from `assembly` + + """ return len(self.sequences) @property @lru_cache(maxsize=None) def length_weighted_gc(self): + """Retrieve the length weighted average GC percentage of provided `assembly`. + + Returns + ------- + float + GC percentage weighted by contig length. + + """ weights = [len(seq)/self.size for seq in self.sequences] gc_counts = [SeqUtils.GC(seq) for seq in self.sequences] return np.average(a=gc_counts, weights=weights) @property def size(self): + """Retrieve the summation of sizes for each contig in the provided `assembly`. + + Returns + ------- + int + Total summation of contig sizes in `assembly` + + """ return sum(len(seq) for seq in self.sequences) @property def largest_seq(self): + """Retrieve the name of the largest sequence in the provided `assembly`. + + Returns + ------- + str + record ID of the largest sequence in `assembly`. + + """ max = float('-inf') largest = None for rec in self.seqrecords: @@ -176,28 +225,72 @@ def largest_seq(self): @property def orfs_called(self): + """Retrieve whether `prot_orfs_fpath` and `nucl_orfs_fpath` have been called. + + Note: This will check whether the aforementioned paths exist and are not empty. + In the future, a checksum comparison will be performed to ensure file integrity. + + Returns + ------- + bool + Description of returned object. + + """ # COMBAK: Add checkpointing checksum check here for fp in [self.prot_orfs_fpath, self.nucl_orfs_fpath]: if not os.path.exists(fp): return False - elif not os.stat(fp).st_size > 0: + elif not os.path.getsize(fp) > 0: return False return True @property @lru_cache(maxsize=None) def nucls(self): + """Retrieve `assembly` nucleotide ORFs. + + Returns + ------- + list + [SeqRecord, SeqRecord, ...] + + """ return self.orfs(orf_type='nucl') @property @lru_cache(maxsize=None) def prots(self): + """Retrieve `assembly` amino-acid ORFs. 
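Editor's note: patches 3 and 4 standardize the "exists and is non-empty" test on `os.path.getsize` (equivalent to `os.stat(fp).st_size`, just more direct), with COMBAK comments marking where a checksum comparison will eventually replace it. A sketch of that guard factored into a helper:

```python
import os

def prepared(fpath):
    """File exists and is non-empty. A checksum comparison is the noted
    follow-up (COMBAK) once checkpointing lands in utilities.py."""
    # os.path.getsize raises OSError on missing paths, so check existence first.
    return os.path.exists(fpath) and os.path.getsize(fpath) > 0

def orfs_called(prot_fpath, nucl_fpath):
    return all(prepared(fp) for fp in (prot_fpath, nucl_fpath))
```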
+ + Returns + ------- + list + [SeqRecord, SeqRecord, ...] + + """ return self.orfs(orf_type='prot') @property def taxonomy_assigned(self): + """Retrieve whether taxonomy has been assigned to `assembly`. This will + check whether `taxonomy_fpath` exists and is non-empty. + + Note: In the future, a checksum comparison should be performed to ensure + file integrity. + + Returns + ------- + type + Description of returned object. + + Raises + ------- + ExceptionName + Why the exception is raised. + + """ # COMBAK: Add checkpointing checksum check here - if os.path.exists(self.taxonomy_fpath) and os.stat(self.taxonomy_fpath).st_size > 0: + if os.path.exists(self.taxonomy_fpath) and os.path.getsize(self.taxonomy_fpath) > 0: return True return False @@ -228,29 +321,44 @@ def fragmentation_metric(self, quality_measure=.50): if sum(lengths) > target_size: return length - def describe(self): - print(f""" -Metagenome Details -________________________ -Assembly: {self.assembly} -Num. Sequences: {self.nseqs:,} -Size: {self.size:,} bp -N50: {self.fragmentation_metric():,} bp -N10: {self.fragmentation_metric(.1):,} bp -N90: {self.fragmentation_metric(.9):,} bp -Length Weighted Avg. GC content: {self.length_weighted_gc:4.2f}% -Largest sequence: {self.largest_seq} -________________________ -Autometa Details -________________________ -Outdir: {self.outdir} -ORFs called: {self.orfs_called} -Prots filepath: {self.prot_orfs_fpath} -Nucl filepath: {self.nucl_orfs_fpath} -Taxonomy method: {self.taxon_method} -Taxonomy assigned: {self.taxonomy_assigned} -Taxonomy filepath: {self.taxonomy_fpath} -""") + def describe(self, autometa_details=True): + """Print `assembly` details. + + Parameters + ---------- + autometa_details : bool + Also log Autometa specific information to the terminal (Default is True). + + Returns + ------- + NoneType + + """ + print('Metagenome Details\n' + '________________________\n' + f'Assembly: {self.assembly}\n' + f'Num. Sequences: {self.nseqs:,}\n' + f'Size: {self.size:,} bp\n' + f'N50: {self.fragmentation_metric():,} bp\n' + f'N10: {self.fragmentation_metric(.1):,} bp\n' + f'N90: {self.fragmentation_metric(.9):,} bp\n' + f'Length Weighted Avg. GC content: {self.length_weighted_gc:4.2f}%\n' + f'Largest sequence: {self.largest_seq}\n' + '________________________\n' + ) + if not autometa_details: + return + print( + 'Autometa Details\n' + '________________________\n' + f'Outdir: {self.outdir}\n' + f'ORFs called: {self.orfs_called}\n' + f'Prots filepath: {self.prot_orfs_fpath}\n' + f'Nucl filepath: {self.nucl_orfs_fpath}\n' + f'Taxonomy method: {self.taxon_method}\n' + f'Taxonomy assigned: {self.taxonomy_assigned}\n' + f'Taxonomy filepath: {self.taxonomy_fpath}\n' + ) @timeit def length_filter(self, out, cutoff=3000): @@ -279,6 +387,7 @@ def length_filter(self, out, cutoff=3000): cutoff value must be a positive real number FileExistsError filepath consisting of sequences that passed filter already exists + """ if not isinstance(cutoff, numbers.Number) or isinstance(cutoff, bool): # https://stackoverflow.com/a/4187220/13118765 @@ -335,6 +444,7 @@ def call_orfs(self, force=False, cpus=0, parallel=True): `force`,`parallel` or `cpus` type was incorrectly supplied. OSError ORF calling failed. + """ for arg, argname in zip([force, parallel],['force','parallel']): if not isinstance(arg, bool) and isinstance(arg, numbers.Number): @@ -376,6 +486,7 @@ def orfs(self, orf_type='prot', cpus=0): ------- ValueError Invalid `orf_type`. 
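Editor's note: only the tail of `fragmentation_metric` is visible in the hunks above (`if sum(lengths) > target_size: return length`). A standard Nx computation consistent with that tail, offered as a hedged reconstruction rather than the repository's exact body: walk contigs from longest to shortest and report the length at which `quality_measure` of the total assembly size is reached (N50 by default):

```python
def fragmentation_metric(seq_lengths, quality_measure=0.50):
    """Nx statistic sketch (N50 by default), assuming the conventional
    longest-first accumulation behind the visible `sum(lengths)` tail."""
    target_size = sum(seq_lengths) * quality_measure
    lengths = []
    for length in sorted(seq_lengths, reverse=True):
        lengths.append(length)
        if sum(lengths) > target_size:
            return length

print(fragmentation_metric([10, 20, 30, 40]))  # toy assembly N50 -> 30
```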
Choices=['prot','nucl'] + """ if not self.orfs_called: self.call_orfs(cpus=cpus) @@ -408,6 +519,7 @@ def get_kmers(self, kmer_size=5, multiprocess=True, out=None, normalized=None, TODO: get_kmers should handle both files and SeqRecords... NOTE: above TODO should be handled in kmers.py not here... + """ out_specified = out is not None out_exists = os.path.exists(out) if out else False @@ -617,7 +729,7 @@ def main(): level=logger.DEBUG) parser = argparse.ArgumentParser(description=""" - This handles filtering by length and taxon as well as ORF calling, + This script handles filtering by length and taxon as well as ORF calling, and various metagenome statistics. """, formatter_class=argparse.ArgumentDefaultsHelpFormatter) diff --git a/docs/template.py b/docs/template.py index 4db513121..0d6803692 100644 --- a/docs/template.py +++ b/docs/template.py @@ -86,7 +86,12 @@ def main(): format='[%(asctime)s %(levelname)s] %(name)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logger.DEBUG) - parser = argparse.ArgumentParser(description='Concise description of script.') + # Note: If you do not have any defaults corresponding to your parameters, + # you may remove the formatter class: ArgumentDefaultsHelpFormatter + # to reduce help text verbosity. + parser = argparse.ArgumentParser( + description='Concise description of script.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('positional',help='') parser.add_argument('--optional',help='') parser.add_argument( From 3033b0d1f526ddb4c07919fccc7545fdc711f905 Mon Sep 17 00:00:00 2001 From: EvanRees Date: Wed, 13 May 2020 16:33:40 -0500 Subject: [PATCH 4/4] :memo: Update describe property GC statement. * :art: Change os.stat(fpath).st_size to os.path.getsize(fpath) --- autometa/common/metabin.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/autometa/common/metabin.py b/autometa/common/metabin.py index dc8f291bf..c8e70de05 100644 --- a/autometa/common/metabin.py +++ b/autometa/common/metabin.py @@ -265,7 +265,7 @@ def prepared(self, fpath): """ # COMBAK: Add checkpointing checksum check here - if os.path.exists(fpath) and os.stat(fpath).st_size > 0: + if os.path.exists(fpath) and os.path.getsize(fpath) > 0: return True return False @@ -325,18 +325,17 @@ def describe(self, autometa_details=True): If True, will output autometa related information (the default is True). """ - print('Metabin Details\n' '________________________\n' f'Num. Contigs: {self.nseqs:,} / {self.nallseqs:,} ({self.seqs_pct:4.2f}% of total metagenome)\n' f'Num. Nucl. ORFs: {self.nnucls:,}\n' f'Num. Prot. ORFs: {self.nprots:,}\n' f'Size: {self.size:,}bp / {self.totalsize:,}bp ({self.size_pct:4.2f}% of total metagenome)\n' - f'Mean GC content: {self.length_weighted_gc:4.2f}%\n' - '________________________\n' - ) - if autometa_details: - print( + f'Length Weighted Avg. GC content: {self.length_weighted_gc:4.2f}%\n' + '________________________\n') + if not autometa_details: + return + print( 'Autometa Details\n' '________________________\n' f'Metagenome: {self.assembly}\n'
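Editor's note: the template.py hunk above adopts `argparse.ArgumentDefaultsHelpFormatter`, which appends `(default: ...)` to each option's help text automatically; the added comment rightly suggests dropping it when a script has no defaults, since it only annotates options that both define a default and supply help text. A tiny sketch of the effect:

```python
import argparse

parser = argparse.ArgumentParser(
    description="Concise description of script.",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--cutoff", type=int, default=3000,
                    help="Cutoff to apply to length filter")
# --help now renders:
#   --cutoff CUTOFF  Cutoff to apply to length filter (default: 3000)
# so defaults never need repeating inside the help strings themselves.
```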