From 940866df85eb9a64cf235486c34461594cbd8d29 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Wed, 27 Apr 2022 11:14:42 -0700 Subject: [PATCH 1/9] init with tax code from #1788 --- src/sourmash/tax/__main__.py | 30 +++-- src/sourmash/tax/tax_utils.py | 150 ++++++++++------------- tests/test-data/tax/test1.gather_ani.csv | 5 + tests/test_tax.py | 118 +++++++++++++----- tests/test_tax_utils.py | 106 +++++++++------- 5 files changed, 242 insertions(+), 167 deletions(-) create mode 100644 tests/test-data/tax/test1.gather_ani.csv diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 01fc6bbe1c..1cee174eb1 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -184,7 +184,8 @@ def genome(args): best_at_rank, seen_perfect = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, - best_only=True, seen_perfect=seen_perfect) + best_only=True, seen_perfect=seen_perfect, estimate_query_ani=True) + except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -194,19 +195,22 @@ def genome(args): status = 'nomatch' if sg.query_name in matched_queries: continue - if sg.fraction <= args.containment_threshold: + if args.ani_threshold and sg.query_ani_at_rank < args.ani_threshold: + status="below_threshold" + notify(f"WARNING: classifying query {sg.query_name} at desired rank {args.rank} does not meet query ANI/AAI threshold {args.ani_threshold}") + elif sg.fraction <= args.containment_threshold: # should this just be less than? status="below_threshold" notify(f"WARNING: classifying query {sg.query_name} at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") else: status="match" - classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank) + classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank, sg.query_ani_at_rank) classifications[args.rank].append(classif) matched_queries.add(sg.query_name) if "krona" in args.output_format: lin_list = display_lineage(sg.lineage).split(';') krona_results.append((sg.fraction, *lin_list)) else: - # classify to the match that passes the containment threshold. + # classify to the rank/match that passes the containment threshold. # To do - do we want to store anything for this match if nothing >= containment threshold? for rank in tax_utils.ascending_taxlist(include_strain=False): # gets best_at_rank for all queries in this gather_csv @@ -214,7 +218,7 @@ def genome(args): best_at_rank, seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, - best_only=True, seen_perfect=seen_perfect) + best_only=True, seen_perfect=seen_perfect, estimate_query_ani=True) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -223,18 +227,26 @@ def genome(args): status = 'nomatch' if sg.query_name in matched_queries: continue - if sg.fraction >= args.containment_threshold: + if args.ani_threshold and sg.query_ani_at_rank >= args.ani_threshold: + status="match" + elif sg.fraction >= args.containment_threshold: status = "match" - classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank) + if status == "match": + classif = ClassificationResult(query_name=sg.query_name, status=status, rank=sg.rank, + fraction=sg.fraction, lineage=sg.lineage, + query_md5=sg.query_md5, query_filename=sg.query_filename, + f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank, + query_ani_at_rank= sg.query_ani_at_rank) classifications[sg.rank].append(classif) matched_queries.add(sg.query_name) continue - if rank == "superkingdom" and status == "nomatch": + elif rank == "superkingdom" and status == "nomatch": status="below_threshold" classif = ClassificationResult(query_name=sg.query_name, status=status, rank="", fraction=0, lineage="", query_md5=sg.query_md5, query_filename=sg.query_filename, - f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank) + f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank, + query_ani_at_rank=sg.query_ani_at_rank) classifications[sg.rank].append(classif) if not any([classifications, krona_results]): diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index d5a7161afc..0c69eeaf55 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -5,12 +5,7 @@ import csv from collections import namedtuple, defaultdict from collections import abc - -from sourmash import sqlite_utils -from sourmash.exceptions import IndexNotSupported - -import sqlite3 - +from sourmash.distance_utils import containment_to_distance __all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs', 'load_gather_results', 'check_and_load_gather_csvs', @@ -24,9 +19,9 @@ from sourmash.logging import notify from sourmash.sourmash_args import load_pathlist_from_file -QueryInfo = namedtuple("QueryInfo", "query_md5, query_filename, query_bp") -SummarizedGatherResult = namedtuple("SummarizedGatherResult", "query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank") -ClassificationResult = namedtuple("ClassificationResult", "query_name, status, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank") +QueryInfo = namedtuple("QueryInfo", "query_md5, query_filename, query_bp, query_hashes") +SummarizedGatherResult = namedtuple("SummarizedGatherResult", "query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank") +ClassificationResult = namedtuple("ClassificationResult", "query_name, status, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank") # Essential Gather column names that must be in gather_csv to allow `tax` summarization EssentialGatherColnames = ('query_name', 'name', 'f_unique_weighted', 'f_unique_to_query', 'unique_intersect_bp', 'remaining_bp', 'query_md5', 'query_filename') @@ -188,7 +183,7 @@ def find_match_lineage(match_ident, tax_assign, *, skip_idents = [], def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], keep_full_identifiers=False, keep_identifier_versions=False, best_only=False, - seen_perfect=set()): + seen_perfect=set(), estimate_query_ani=False): """ Summarize gather results at specified taxonomic rank """ @@ -198,7 +193,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], sum_uniq_to_query = defaultdict(lambda: defaultdict(float)) sum_uniq_bp = defaultdict(lambda: defaultdict(float)) query_info = {} - + ksize,scaled,query_nhashes=None,None,None for row in gather_results: # get essential gather info query_name = row['query_name'] @@ -207,13 +202,25 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], unique_intersect_bp = int(row['unique_intersect_bp']) query_md5 = row['query_md5'] query_filename = row['query_filename'] - # get query_bp - if query_name not in query_info.keys(): - query_bp = unique_intersect_bp + int(row['remaining_bp']) + if query_name not in query_info.keys(): #REMOVING THIS AFFECTS GATHER RESULTS!!! BUT query bp should always be same for same query? bug? + if "query_nhashes" in row.keys(): + query_nhashes = int(row["query_nhashes"]) + if "query_bp" in row.keys(): + query_bp = int(row["query_bp"]) + else: + query_bp = unique_intersect_bp + int(row['remaining_bp']) # store query info - query_info[query_name] = QueryInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp) - match_ident = row['name'] + query_info[query_name] = QueryInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp, query_hashes = query_nhashes) + + if estimate_query_ani and (not ksize or not scaled): # just need to set these once. BUT, if we have these, should we check for compatibility when loading the gather file? + if "ksize" in row.keys(): # ksize and scaled were added to gather results in same PR + ksize = int(row['ksize']) + scaled = int(row['scaled']) + else: + estimate_query_ani=False + notify("WARNING: Please run gather with sourmash >= 4.3 to estimate query ANI at rank. Continuing without ANI...") + match_ident = row['name'] # 100% match? are we looking at something in the database? if f_unique_to_query >= 1.0 and query_name not in seen_perfect: # only want to notify once, not for each rank @@ -225,27 +232,29 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], # get lineage for match lineage = find_match_lineage(match_ident, tax_assign, - skip_idents=skip_idents, - keep_full_identifiers=keep_full_identifiers, - keep_identifier_versions=keep_identifier_versions) + skip_idents=skip_idents, + keep_full_identifiers=keep_full_identifiers, + keep_identifier_versions=keep_identifier_versions) # ident was in skip_idents if not lineage: continue # summarize at rank! lineage = pop_to_rank(lineage, rank) - assert lineage[-1].rank == rank, (rank, lineage[-1]) + assert lineage[-1].rank == rank, lineage[-1] # record info sum_uniq_to_query[query_name][lineage] += f_unique_to_query sum_uniq_weighted[query_name][lineage] += f_uniq_weighted sum_uniq_bp[query_name][lineage] += unique_intersect_bp + # sort and store each as SummarizedGatherResult sum_uniq_to_query_sorted = [] for query_name, lineage_weights in sum_uniq_to_query.items(): qInfo = query_info[query_name] sumgather_items = list(lineage_weights.items()) sumgather_items.sort(key = lambda x: -x[1]) + query_ani = None if best_only: lineage, fraction = sumgather_items[0] if fraction > 1: @@ -254,13 +263,19 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], continue f_weighted_at_rank = sum_uniq_weighted[query_name][lineage] bp_intersect_at_rank = sum_uniq_bp[query_name][lineage] - sres = SummarizedGatherResult(query_name, rank, fraction, lineage, qInfo.query_md5, qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank) + if estimate_query_ani: + query_ani = containment_to_distance(fraction, ksize, scaled, + n_unique_kmers= qInfo.query_hashes, sequence_len_bp= qInfo.query_bp, + return_identity=True)[0] + sres = SummarizedGatherResult(query_name, rank, fraction, lineage, qInfo.query_md5, + qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) else: total_f_weighted= 0.0 total_f_classified = 0.0 total_bp_classified = 0 for lineage, fraction in sumgather_items: + query_ani=None if fraction > 1: raise ValueError(f"The tax summary of query '{query_name}' is {fraction}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") elif fraction == 0: @@ -270,16 +285,23 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], total_f_weighted += f_weighted_at_rank bp_intersect_at_rank = int(sum_uniq_bp[query_name][lineage]) total_bp_classified += bp_intersect_at_rank - sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_intersect_at_rank) + if estimate_query_ani: + query_ani = containment_to_distance(fraction, ksize, scaled, + n_unique_kmers=qInfo.query_hashes, sequence_len_bp=qInfo.query_bp, + return_identity=True)[0] + sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, + query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) # record unclassified lineage = () + query_ani=None fraction = 1.0 - total_f_classified if fraction > 0: f_weighted_at_rank = 1.0 - total_f_weighted bp_intersect_at_rank = qInfo.query_bp - total_bp_classified - sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_intersect_at_rank) + sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, + query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) return sum_uniq_to_query_sorted, seen_perfect @@ -628,27 +650,15 @@ def load(cls, filename, *, delimiter=',', force=False, class LineageDB_Sqlite(abc.Mapping): """ - A LineageDB based on a sqlite3 database with a 'sourmash_taxonomy' table. + A LineageDB based on a sqlite3 database with a 'taxonomy' table. """ # NOTE: 'order' is a reserved name in sql, so we have to use 'order_'. columns = ('superkingdom', 'phylum', 'order_', 'class', 'family', 'genus', 'species', 'strain') - table_name = 'sourmash_taxonomy' - def __init__(self, conn, *, table_name=None): + def __init__(self, conn): self.conn = conn - # provide for legacy support for pre-sourmash_internal days... - if table_name is not None: - self.table_name = table_name - - # check that the right table is there. - c = conn.cursor() - try: - c.execute(f'SELECT * FROM {self.table_name} LIMIT 1') - except (sqlite3.DatabaseError, sqlite3.OperationalError): - raise ValueError("not a taxonomy database") - # check: can we do a 'select' on the right table? self.__len__() c = conn.cursor() @@ -656,7 +666,7 @@ def __init__(self, conn, *, table_name=None): # get available ranks... ranks = set() for column, rank in zip(self.columns, taxlist(include_strain=True)): - query = f'SELECT COUNT({column}) FROM {self.table_name} WHERE {column} IS NOT NULL AND {column} != ""' + query = f'SELECT COUNT({column}) FROM taxonomy WHERE {column} IS NOT NULL AND {column} != ""' c.execute(query) cnt, = c.fetchone() if cnt: @@ -667,35 +677,14 @@ def __init__(self, conn, *, table_name=None): @classmethod def load(cls, location): - "load taxonomy information from an existing sqlite3 database" - conn = sqlite_utils.open_sqlite_db(location) - if not conn: - raise ValueError("not a sqlite taxonomy database") - - table_name = None - c = conn.cursor() + "load taxonomy information from a sqlite3 database" + import sqlite3 try: - info = sqlite_utils.get_sourmash_internal(c) - except sqlite3.OperationalError: - info = {} - - if 'SqliteLineage' in info: - if info['SqliteLineage'] != '1.0': - raise IndexNotSupported - - table_name = 'sourmash_taxonomy' - else: - # legacy support for old taxonomy DB, pre sourmash_internal. - try: - c.execute('SELECT * FROM taxonomy LIMIT 1') - table_name = 'taxonomy' - except sqlite3.OperationalError: - pass - - if table_name is None: - raise ValueError("not a sqlite taxonomy database") - - return cls(conn, table_name=table_name) + conn = sqlite3.connect(location) + db = cls(conn) + except sqlite3.DatabaseError: + raise ValueError("not a sqlite database") + return db def _make_tup(self, row): "build a tuple of LineagePairs for this sqlite row" @@ -705,7 +694,7 @@ def _make_tup(self, row): def __getitem__(self, ident): "Retrieve lineage for identifer" c = self.cursor - c.execute(f'SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name} WHERE ident=?', (ident,)) + c.execute('SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (ident,)) # retrieve names list... names = c.fetchone() @@ -726,7 +715,7 @@ def __bool__(self): def __len__(self): "Return number of rows" c = self.conn.cursor() - c.execute(f'SELECT COUNT(DISTINCT ident) FROM {self.table_name}') + c.execute('SELECT COUNT(DISTINCT ident) FROM taxonomy') nrows, = c.fetchone() return nrows @@ -734,7 +723,7 @@ def __iter__(self): "Return all identifiers" # create new cursor so as to allow other operations c = self.conn.cursor() - c.execute(f'SELECT DISTINCT ident FROM {self.table_name}') + c.execute('SELECT DISTINCT ident FROM taxonomy') for ident, in c: yield ident @@ -743,12 +732,11 @@ def items(self): "return all items in the sqlite database" c = self.conn.cursor() - c.execute(f'SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name}') + c.execute('SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy') for ident, *names in c: yield ident, self._make_tup(names) - class MultiLineageDB(abc.Mapping): "A wrapper for (dynamically) combining multiple lineage databases." @@ -845,23 +833,15 @@ def save(self, filename_or_fp, file_format): if is_filename: fp.close() - def _save_sqlite(self, filename, *, conn=None): - from sourmash import sqlite_utils - - if conn is None: - db = sqlite3.connect(filename) - else: - assert not filename - db = conn + def _save_sqlite(self, filename): + import sqlite3 + db = sqlite3.connect(filename) cursor = db.cursor() try: - sqlite_utils.add_sourmash_internal(cursor, 'SqliteLineage', '1.0') - - # CTB: could add 'IF NOT EXIST' here; would need tests, too. cursor.execute(""" - CREATE TABLE sourmash_taxonomy ( + CREATE TABLE taxonomy ( ident TEXT NOT NULL, superkingdom TEXT, phylum TEXT, @@ -879,7 +859,7 @@ class TEXT, raise ValueError(f"taxonomy table already exists in '{filename}'") # follow up and create index - cursor.execute("CREATE UNIQUE INDEX sourmash_taxonomy_ident ON sourmash_taxonomy(ident);") + cursor.execute("CREATE UNIQUE INDEX taxonomy_ident ON taxonomy(ident);") for ident, tax in self.items(): x = [ident, *[ t.name for t in tax ]] @@ -888,7 +868,7 @@ class TEXT, while len(x) < 9: x.append('') - cursor.execute('INSERT INTO sourmash_taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x) + cursor.execute('INSERT INTO taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x) db.commit() diff --git a/tests/test-data/tax/test1.gather_ani.csv b/tests/test-data/tax/test1.gather_ani.csv new file mode 100644 index 0000000000..48a09eb199 --- /dev/null +++ b/tests/test-data/tax/test1.gather_ani.csv @@ -0,0 +1,5 @@ +intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,ksize,scaled,query_nhashes +442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,31,1000,5013970 +390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,31,1000,4571970 +138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,31,1000,4181970 +338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,31,1000,4327970 diff --git a/tests/test_tax.py b/tests/test_tax.py index 1faf6ce19a..3d19cfd740 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -9,9 +9,6 @@ from sourmash.tax import tax_utils from sourmash_tst_utils import SourmashCommandFailed -from sourmash import sqlite_utils -from sourmash.exceptions import IndexNotSupported - ## command line tests def test_run_sourmash_tax(): status, out, err = utils.runscript('sourmash', ['tax'], fail_ok=True) @@ -705,6 +702,18 @@ def test_genome_rank_stdout_0_db(runtmp): assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0' in c.last_result.out + # too stringent of containment threshold: + c.run_sourmash('tax', 'genome', '--gather-csv', g_csv, '--taxonomy-csv', + tax, '--rank', 'species', '--containment-threshold', '1.0') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "WARNING: classifying query test1 at desired rank species does not meet containment threshold 1.0" in c.last_result.err + assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0," in c.last_result.out + def test_genome_rank_csv_0(runtmp): # test basic genome - output csv @@ -1366,6 +1375,81 @@ def test_genome_over100percent_error(runtmp): assert "ERROR: The tax summary of query 'test1' is 1.1, which is > 100% of the query!!" in runtmp.last_result.err +def test_genome_ani_threshold_input_errors(runtmp): + c = runtmp + g_csv = utils.get_test_data('tax/test1.gather_ani.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + below_threshold = "-1" + + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('tax', 'genome', '-g', tax, '--taxonomy-csv', tax, + '--ani-threshold', below_threshold) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "ERROR: Argument must be >0 and <1" in str(exc.value) + + above_threshold = "1.1" + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', above_threshold) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "ERROR: Argument must be >0 and <1" in str(exc.value) + + not_a_float = "str" + + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', not_a_float) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "ERROR: Must be a floating point number" in str(exc.value) + + +def test_genome_ani_threshold(runtmp): + c = runtmp + g_csv = utils.get_test_data('tax/test1.gather_ani.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', "0.95") + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "WARNING: Please run gather with sourmash >= 4.3 to estimate query ANI at rank. Continuing without ANI..." not in c.last_result.err + assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,0.9328896594471843' in c.last_result.out + + # more lax threshold + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', "0.9") + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0' in c.last_result.out + + # too stringent of threshold (using rank) + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', "1.0", '--rank', 'species') + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "WARNING: classifying query test1 at desired rank species does not meet query ANI/AAI threshold 1.0" in c.last_result.err + assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0,0.9247805047263588" in c.last_result.out + + def test_annotate_0(runtmp): # test annotate c = runtmp @@ -1822,31 +1906,3 @@ def test_tax_prepare_3_db_to_csv_empty_ranks_3(runtmp): keep_identifier_versions=False) assert set(db1) == set(db2) assert set(db1) == set(db3) - - -def test_tax_prepare_sqlite_lineage_version(runtmp): - # test bad sourmash_internals version for SqliteLineage - taxcsv = utils.get_test_data('tax/test.taxonomy.csv') - taxout = runtmp.output('out.db') - - runtmp.run_sourmash('tax', 'prepare', '-t', taxcsv, - '-o', taxout, '-F', 'sql') - assert os.path.exists(taxout) - - # set bad version - conn = sqlite_utils.open_sqlite_db(taxout) - c = conn.cursor() - c.execute("UPDATE sourmash_internal SET value='0.9' WHERE key='SqliteLineage'") - - conn.commit() - conn.close() - - with pytest.raises(IndexNotSupported): - db = tax_utils.MultiLineageDB.load([taxout]) - -def test_tax_prepare_sqlite_no_lineage(): - # no lineage table at all - sqldb = utils.get_test_data('sqlite/index.sqldb') - - with pytest.raises(ValueError): - db = tax_utils.MultiLineageDB.load([sqldb]) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 449e26f972..17e457b2b3 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1,6 +1,7 @@ """ Tests for functions in taxonomy submodule. """ + import pytest from os.path import basename @@ -22,9 +23,11 @@ from sourmash.lca.lca_utils import LineagePair # utility functions for testing -def make_mini_gather_results(g_infolist): +def make_mini_gather_results(g_infolist, include_ksize_and_scaled=False): # make mini gather_results min_header = ["query_name", "name", "match_ident", "f_unique_to_query", "query_md5", "query_filename", "f_unique_weighted", "unique_intersect_bp", "remaining_bp"] + if include_ksize_and_scaled: + min_header.extend(['ksize', 'scaled']) gather_results = [] for g_info in g_infolist: inf = dict(zip(min_header, g_info)) @@ -351,30 +354,36 @@ def test_summarize_gather_at_0(): def test_summarize_gather_at_1(): """test two matches, diff f_unique_to_query""" # make mini gather_results - gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40'] - gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.1', '10', '90'] - g_res = make_mini_gather_results([gA,gB]) + ksize=31 + scaled=10 + gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40', ksize, scaled] + gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.1', '10', '90', ksize, scaled] + g_res = make_mini_gather_results([gA,gB], include_ksize_and_scaled=True) # make mini taxonomy gA_tax = ("gA", "a;b;c") gB_tax = ("gB", "a;b;d") taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, estimate_query_ani=True) # superkingdom assert len(sk_sum) == 2 - print("superkingdom summarized gather: ", sk_sum[0]) + print("\nsuperkingdom summarized gather 0: ", sk_sum[0]) assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 0.7 assert sk_sum[0].bp_match_at_rank == 70 + print("superkingdom summarized gather 1: ", sk_sum[1]) assert sk_sum[1].lineage == () assert round(sk_sum[1].fraction, 1) == 0.3 assert sk_sum[1].bp_match_at_rank == 30 + assert sk_sum[0].query_ani_at_rank == 0.9885602934376099 + assert sk_sum[1].query_ani_at_rank == None # phylum - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) - print("phylum summarized gather: ", phy_sum[0]) + phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, estimate_query_ani=False) + print("phylum summarized gather 0: ", phy_sum[0]) + print("phylum summarized gather 1: ", phy_sum[1]) assert len(phy_sum) == 2 assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) assert phy_sum[0].fraction == 0.7 @@ -383,8 +392,10 @@ def test_summarize_gather_at_1(): assert phy_sum[1].lineage == () assert round(phy_sum[1].fraction, 1) == 0.3 assert phy_sum[1].bp_match_at_rank == 30 + assert phy_sum[0].query_ani_at_rank == None + assert phy_sum[1].query_ani_at_rank == None # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res) + cl_sum, _ = summarize_gather_at("class", taxD, g_res, estimate_query_ani=True) assert len(cl_sum) == 3 print("class summarized gather: ", cl_sum) assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -393,6 +404,7 @@ def test_summarize_gather_at_1(): assert cl_sum[0].fraction == 0.6 assert cl_sum[0].f_weighted_at_rank == 0.5 assert cl_sum[0].bp_match_at_rank == 60 + assert cl_sum[0].query_ani_at_rank == 0.9836567776983505 assert cl_sum[1].rank == 'class' assert cl_sum[1].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -401,8 +413,10 @@ def test_summarize_gather_at_1(): assert cl_sum[1].fraction == 0.1 assert cl_sum[1].f_weighted_at_rank == 0.1 assert cl_sum[1].bp_match_at_rank == 10 + assert cl_sum[1].query_ani_at_rank == 0.9284145445194744 assert cl_sum[2].lineage == () assert round(cl_sum[2].fraction, 1) == 0.3 + assert cl_sum[2].query_ani_at_rank == None def test_summarize_gather_at_perfect_match(): @@ -532,32 +546,38 @@ def test_summarize_gather_at_missing_fail(): def test_summarize_gather_at_best_only_0(): """test two matches, diff f_unique_to_query""" # make mini gather_results - gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40'] - gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.5', '10', '90'] - g_res = make_mini_gather_results([gA,gB]) + ksize =31 + scaled=10 + gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40', ksize, scaled] + gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.5', '10', '90', ksize, scaled] + g_res = make_mini_gather_results([gA,gB],include_ksize_and_scaled=True) # make mini taxonomy gA_tax = ("gA", "a;b;c") gB_tax = ("gB", "a;b;d") taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) + sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True,estimate_query_ani=True) # superkingdom assert len(sk_sum) == 1 print("superkingdom summarized gather: ", sk_sum[0]) assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 0.7 assert sk_sum[0].bp_match_at_rank == 70 + print("superk ANI:",sk_sum[0].query_ani_at_rank) + assert sk_sum[0].query_ani_at_rank == 0.9885602934376099 # phylum - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True) + phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True,estimate_query_ani=True) print("phylum summarized gather: ", phy_sum[0]) assert len(phy_sum) == 1 assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) assert phy_sum[0].fraction == 0.7 assert phy_sum[0].bp_match_at_rank == 70 + print("phy ANI:",phy_sum[0].query_ani_at_rank) + assert phy_sum[0].query_ani_at_rank == 0.9885602934376099 # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True) + cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True, estimate_query_ani=True) assert len(cl_sum) == 1 print("class summarized gather: ", cl_sum) assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -565,6 +585,8 @@ def test_summarize_gather_at_best_only_0(): LineagePair(rank='class', name='c')) assert cl_sum[0].fraction == 0.6 assert cl_sum[0].bp_match_at_rank == 60 + print("cl ANI:",cl_sum[0].query_ani_at_rank) + assert cl_sum[0].query_ani_at_rank == 0.9836567776983505 def test_summarize_gather_at_best_only_equal_choose_first(): @@ -597,12 +619,14 @@ def test_write_summary_csv(runtmp): sum_gather = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.0, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')))]} + LineagePair(rank='phylum', name='b')), + query_ani_at_rank=None)]} outs= runtmp.output("outsum.csv") with open(outs, 'w') as out_fp: @@ -610,9 +634,9 @@ def test_write_summary_csv(runtmp): sr = [x.rstrip().split(',') for x in open(outs, 'r')] print("gather_summary_results_from_file: \n", sr) - assert ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank'] == sr[0] - assert ['queryA', 'superkingdom', '1.0', 'a', 'queryA_md5', 'queryA.sig', '1.0', '100'] == sr[1] - assert ['queryA', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100'] == sr[2] + assert ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank'] == sr[0] + assert ['queryA', 'superkingdom', '1.0', 'a', 'queryA_md5', 'queryA.sig', '1.0', '100', ''] == sr[1] + assert ['queryA', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100',''] == sr[2] def test_write_classification(runtmp): @@ -620,7 +644,8 @@ def test_write_classification(runtmp): classif = ClassificationResult('queryA', 'match', 'phylum', 1.0, (LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')), - 'queryA_md5', 'queryA.sig', 1.0, 100) + 'queryA_md5', 'queryA.sig', 1.0, 100, + query_ani_at_rank=None) classification = {'phylum': [classif]} @@ -630,8 +655,8 @@ def test_write_classification(runtmp): sr = [x.rstrip().split(',') for x in open(outs, 'r')] print("gather_classification_results_from_file: \n", sr) - assert ['query_name', 'status', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank'] == sr[0] - assert ['queryA', 'match', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100'] == sr[1] + assert ['query_name', 'status', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank'] == sr[0] + assert ['queryA', 'match', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100', ''] == sr[1] def test_make_krona_header_0(): @@ -816,21 +841,25 @@ def test_combine_sumgather_csvs_by_lineage(runtmp): sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')))]} + LineagePair(rank='phylum', name='b')), + query_ani_at_rank=None)]} sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='c')))]} + LineagePair(rank='phylum', name='c')), + query_ani_at_rank=None)]} # write summarized gather results csvs sg1= runtmp.output("sample1.csv") @@ -903,21 +932,25 @@ def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp): sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')))]} + LineagePair(rank='phylum', name='b')), + query_ani_at_rank=None)]} sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='c')))]} + LineagePair(rank='phylum', name='c')), + query_ani_at_rank=None)]} # write summarized gather results csvs sg1= runtmp.output("sample1.csv") @@ -967,17 +1000,6 @@ def test_tax_multi_load_files(runtmp): MultiLineageDB.load([runtmp.output('no-such-file')]) -def test_tax_sql_load_new_file(runtmp): - # test loading a newer-format sql file with sourmash_internals table - taxonomy_db = utils.get_test_data('sqlite/test.taxonomy.db') - - db = MultiLineageDB.load([taxonomy_db]) - print(list(db.keys())) - assert len(db) == 6 - assert 'strain' not in db.available_ranks - assert db['GCF_001881345'][0].rank == 'superkingdom' - - def test_tax_multi_load_files_shadowed(runtmp): # test loading various good and bad files taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') From 4e2d43bb1f3e480a44704e8c04c8f96bd0fa2cdf Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Wed, 27 Apr 2022 11:46:01 -0700 Subject: [PATCH 2/9] tax threshold arg from #1788 --- src/sourmash/cli/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 1725518747..2063fd09de 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -67,10 +67,14 @@ def range_limited_float_type(arg): return f -def add_tax_threshold_arg(parser, default=0.1): +def add_tax_threshold_arg(parser, containment_default=0.1, ani_default=None): parser.add_argument( - '--containment-threshold', default=default, type=range_limited_float_type, - help=f'minimum containment threshold for classification; default={default}' + '--containment-threshold', default=containment_default, type=range_limited_float_type, + help=f'minimum containment threshold for classification; default={containment_default}', + ) + parser.add_argument( + '--ani-threshold', '--aai-threshold', default=ani_default, type=range_limited_float_type, + help=f'minimum ANI threshold (nucleotide gather) or AAI threshold (protein gather) for classification; default={ani_default}', ) From e29b5ddec53df1a0c919cf79e967df07a3bd0a25 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Wed, 27 Apr 2022 14:58:59 -0700 Subject: [PATCH 3/9] add back sql changes from latest --- src/sourmash/tax/tax_utils.py | 112 ++++++++++++++++++++++++---------- tests/test_tax.py | 31 ++++++++++ tests/test_tax_utils.py | 11 ++++ 3 files changed, 123 insertions(+), 31 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 0c69eeaf55..6ee0787f3b 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -5,8 +5,14 @@ import csv from collections import namedtuple, defaultdict from collections import abc + +from sourmash import sqlite_utils +from sourmash.exceptions import IndexNotSupported from sourmash.distance_utils import containment_to_distance +import sqlite3 + + __all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs', 'load_gather_results', 'check_and_load_gather_csvs', 'find_match_lineage', 'summarize_gather_at', @@ -183,7 +189,8 @@ def find_match_lineage(match_ident, tax_assign, *, skip_idents = [], def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], keep_full_identifiers=False, keep_identifier_versions=False, best_only=False, - seen_perfect=set(), estimate_query_ani=False): + seen_perfect=set(), + estimate_query_ani=False): """ Summarize gather results at specified taxonomic rank """ @@ -194,6 +201,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], sum_uniq_bp = defaultdict(lambda: defaultdict(float)) query_info = {} ksize,scaled,query_nhashes=None,None,None + for row in gather_results: # get essential gather info query_name = row['query_name'] @@ -202,6 +210,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], unique_intersect_bp = int(row['unique_intersect_bp']) query_md5 = row['query_md5'] query_filename = row['query_filename'] + # get query_bp if query_name not in query_info.keys(): #REMOVING THIS AFFECTS GATHER RESULTS!!! BUT query bp should always be same for same query? bug? if "query_nhashes" in row.keys(): query_nhashes = int(row["query_nhashes"]) @@ -209,17 +218,18 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], query_bp = int(row["query_bp"]) else: query_bp = unique_intersect_bp + int(row['remaining_bp']) + # store query info query_info[query_name] = QueryInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp, query_hashes = query_nhashes) - + if estimate_query_ani and (not ksize or not scaled): # just need to set these once. BUT, if we have these, should we check for compatibility when loading the gather file? - if "ksize" in row.keys(): # ksize and scaled were added to gather results in same PR + if "ksize" in row.keys(): ksize = int(row['ksize']) scaled = int(row['scaled']) else: estimate_query_ani=False - notify("WARNING: Please run gather with sourmash >= 4.3 to estimate query ANI at rank. Continuing without ANI...") - + notify("WARNING: Please run gather with sourmash >= 4.4 to estimate query ANI at rank. Continuing without ANI...") + match_ident = row['name'] # 100% match? are we looking at something in the database? @@ -241,13 +251,13 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], # summarize at rank! lineage = pop_to_rank(lineage, rank) + #assert lineage[-1].rank == rank, (rank, lineage[-1]) assert lineage[-1].rank == rank, lineage[-1] # record info sum_uniq_to_query[query_name][lineage] += f_unique_to_query sum_uniq_weighted[query_name][lineage] += f_uniq_weighted sum_uniq_bp[query_name][lineage] += unique_intersect_bp - # sort and store each as SummarizedGatherResult sum_uniq_to_query_sorted = [] for query_name, lineage_weights in sum_uniq_to_query.items(): @@ -265,8 +275,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], bp_intersect_at_rank = sum_uniq_bp[query_name][lineage] if estimate_query_ani: query_ani = containment_to_distance(fraction, ksize, scaled, - n_unique_kmers= qInfo.query_hashes, sequence_len_bp= qInfo.query_bp, - return_identity=True)[0] + n_unique_kmers= qInfo.query_hashes, sequence_len_bp= qInfo.query_bp).ani sres = SummarizedGatherResult(query_name, rank, fraction, lineage, qInfo.query_md5, qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) @@ -287,15 +296,14 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], total_bp_classified += bp_intersect_at_rank if estimate_query_ani: query_ani = containment_to_distance(fraction, ksize, scaled, - n_unique_kmers=qInfo.query_hashes, sequence_len_bp=qInfo.query_bp, - return_identity=True)[0] + n_unique_kmers=qInfo.query_hashes, sequence_len_bp=qInfo.query_bp).ani sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) # record unclassified lineage = () - query_ani=None + query_ani = None fraction = 1.0 - total_f_classified if fraction > 0: f_weighted_at_rank = 1.0 - total_f_weighted @@ -650,15 +658,27 @@ def load(cls, filename, *, delimiter=',', force=False, class LineageDB_Sqlite(abc.Mapping): """ - A LineageDB based on a sqlite3 database with a 'taxonomy' table. + A LineageDB based on a sqlite3 database with a 'sourmash_taxonomy' table. """ # NOTE: 'order' is a reserved name in sql, so we have to use 'order_'. columns = ('superkingdom', 'phylum', 'order_', 'class', 'family', 'genus', 'species', 'strain') + table_name = 'sourmash_taxonomy' - def __init__(self, conn): + def __init__(self, conn, *, table_name=None): self.conn = conn + # provide for legacy support for pre-sourmash_internal days... + if table_name is not None: + self.table_name = table_name + + # check that the right table is there. + c = conn.cursor() + try: + c.execute(f'SELECT * FROM {self.table_name} LIMIT 1') + except (sqlite3.DatabaseError, sqlite3.OperationalError): + raise ValueError("not a taxonomy database") + # check: can we do a 'select' on the right table? self.__len__() c = conn.cursor() @@ -666,7 +686,7 @@ def __init__(self, conn): # get available ranks... ranks = set() for column, rank in zip(self.columns, taxlist(include_strain=True)): - query = f'SELECT COUNT({column}) FROM taxonomy WHERE {column} IS NOT NULL AND {column} != ""' + query = f'SELECT COUNT({column}) FROM {self.table_name} WHERE {column} IS NOT NULL AND {column} != ""' c.execute(query) cnt, = c.fetchone() if cnt: @@ -677,14 +697,35 @@ def __init__(self, conn): @classmethod def load(cls, location): - "load taxonomy information from a sqlite3 database" - import sqlite3 + "load taxonomy information from an existing sqlite3 database" + conn = sqlite_utils.open_sqlite_db(location) + if not conn: + raise ValueError("not a sqlite taxonomy database") + + table_name = None + c = conn.cursor() try: - conn = sqlite3.connect(location) - db = cls(conn) - except sqlite3.DatabaseError: - raise ValueError("not a sqlite database") - return db + info = sqlite_utils.get_sourmash_internal(c) + except sqlite3.OperationalError: + info = {} + + if 'SqliteLineage' in info: + if info['SqliteLineage'] != '1.0': + raise IndexNotSupported + + table_name = 'sourmash_taxonomy' + else: + # legacy support for old taxonomy DB, pre sourmash_internal. + try: + c.execute('SELECT * FROM taxonomy LIMIT 1') + table_name = 'taxonomy' + except sqlite3.OperationalError: + pass + + if table_name is None: + raise ValueError("not a sqlite taxonomy database") + + return cls(conn, table_name=table_name) def _make_tup(self, row): "build a tuple of LineagePairs for this sqlite row" @@ -694,7 +735,7 @@ def _make_tup(self, row): def __getitem__(self, ident): "Retrieve lineage for identifer" c = self.cursor - c.execute('SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (ident,)) + c.execute(f'SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name} WHERE ident=?', (ident,)) # retrieve names list... names = c.fetchone() @@ -715,7 +756,7 @@ def __bool__(self): def __len__(self): "Return number of rows" c = self.conn.cursor() - c.execute('SELECT COUNT(DISTINCT ident) FROM taxonomy') + c.execute(f'SELECT COUNT(DISTINCT ident) FROM {self.table_name}') nrows, = c.fetchone() return nrows @@ -723,7 +764,7 @@ def __iter__(self): "Return all identifiers" # create new cursor so as to allow other operations c = self.conn.cursor() - c.execute('SELECT DISTINCT ident FROM taxonomy') + c.execute(f'SELECT DISTINCT ident FROM {self.table_name}') for ident, in c: yield ident @@ -732,11 +773,12 @@ def items(self): "return all items in the sqlite database" c = self.conn.cursor() - c.execute('SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy') + c.execute(f'SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name}') for ident, *names in c: yield ident, self._make_tup(names) + class MultiLineageDB(abc.Mapping): "A wrapper for (dynamically) combining multiple lineage databases." @@ -833,15 +875,23 @@ def save(self, filename_or_fp, file_format): if is_filename: fp.close() - def _save_sqlite(self, filename): - import sqlite3 - db = sqlite3.connect(filename) + def _save_sqlite(self, filename, *, conn=None): + from sourmash import sqlite_utils + + if conn is None: + db = sqlite3.connect(filename) + else: + assert not filename + db = conn cursor = db.cursor() try: + sqlite_utils.add_sourmash_internal(cursor, 'SqliteLineage', '1.0') + + # CTB: could add 'IF NOT EXIST' here; would need tests, too. cursor.execute(""" - CREATE TABLE taxonomy ( + CREATE TABLE sourmash_taxonomy ( ident TEXT NOT NULL, superkingdom TEXT, phylum TEXT, @@ -859,7 +909,7 @@ class TEXT, raise ValueError(f"taxonomy table already exists in '{filename}'") # follow up and create index - cursor.execute("CREATE UNIQUE INDEX taxonomy_ident ON taxonomy(ident);") + cursor.execute("CREATE UNIQUE INDEX sourmash_taxonomy_ident ON sourmash_taxonomy(ident);") for ident, tax in self.items(): x = [ident, *[ t.name for t in tax ]] @@ -868,7 +918,7 @@ class TEXT, while len(x) < 9: x.append('') - cursor.execute('INSERT INTO taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x) + cursor.execute('INSERT INTO sourmash_taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x) db.commit() diff --git a/tests/test_tax.py b/tests/test_tax.py index 3d19cfd740..8cf2218ef7 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -9,6 +9,9 @@ from sourmash.tax import tax_utils from sourmash_tst_utils import SourmashCommandFailed +from sourmash import sqlite_utils +from sourmash.exceptions import IndexNotSupported + ## command line tests def test_run_sourmash_tax(): status, out, err = utils.runscript('sourmash', ['tax'], fail_ok=True) @@ -1906,3 +1909,31 @@ def test_tax_prepare_3_db_to_csv_empty_ranks_3(runtmp): keep_identifier_versions=False) assert set(db1) == set(db2) assert set(db1) == set(db3) + + +def test_tax_prepare_sqlite_lineage_version(runtmp): + # test bad sourmash_internals version for SqliteLineage + taxcsv = utils.get_test_data('tax/test.taxonomy.csv') + taxout = runtmp.output('out.db') + + runtmp.run_sourmash('tax', 'prepare', '-t', taxcsv, + '-o', taxout, '-F', 'sql') + assert os.path.exists(taxout) + + # set bad version + conn = sqlite_utils.open_sqlite_db(taxout) + c = conn.cursor() + c.execute("UPDATE sourmash_internal SET value='0.9' WHERE key='SqliteLineage'") + + conn.commit() + conn.close() + + with pytest.raises(IndexNotSupported): + db = tax_utils.MultiLineageDB.load([taxout]) + +def test_tax_prepare_sqlite_no_lineage(): + # no lineage table at all + sqldb = utils.get_test_data('sqlite/index.sqldb') + + with pytest.raises(ValueError): + db = tax_utils.MultiLineageDB.load([sqldb]) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 17e457b2b3..e0c9001bfd 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1000,6 +1000,17 @@ def test_tax_multi_load_files(runtmp): MultiLineageDB.load([runtmp.output('no-such-file')]) +def test_tax_sql_load_new_file(runtmp): + # test loading a newer-format sql file with sourmash_internals table + taxonomy_db = utils.get_test_data('sqlite/test.taxonomy.db') + + db = MultiLineageDB.load([taxonomy_db]) + print(list(db.keys())) + assert len(db) == 6 + assert 'strain' not in db.available_ranks + assert db['GCF_001881345'][0].rank == 'superkingdom' + + def test_tax_multi_load_files_shadowed(runtmp): # test loading various good and bad files taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') From 428cefb1ab03f608b73f213e7933a5e171b464ea Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Wed, 27 Apr 2022 15:25:16 -0700 Subject: [PATCH 4/9] make sure we can still use tax without ANI col --- src/sourmash/tax/__main__.py | 2 +- src/sourmash/tax/tax_utils.py | 1 - tests/test_tax.py | 20 +++++++++++++++++++- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 1cee174eb1..9ba282eac6 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -227,7 +227,7 @@ def genome(args): status = 'nomatch' if sg.query_name in matched_queries: continue - if args.ani_threshold and sg.query_ani_at_rank >= args.ani_threshold: + if sg.query_ani_at_rank is not None and args.ani_threshold and sg.query_ani_at_rank >= args.ani_threshold: status="match" elif sg.fraction >= args.containment_threshold: status = "match" diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 6ee0787f3b..d79a727405 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -251,7 +251,6 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], # summarize at rank! lineage = pop_to_rank(lineage, rank) - #assert lineage[-1].rank == rank, (rank, lineage[-1]) assert lineage[-1].rank == rank, lineage[-1] # record info sum_uniq_to_query[query_name][lineage] += f_unique_to_query diff --git a/tests/test_tax.py b/tests/test_tax.py index 8cf2218ef7..3dcb2dbd1a 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -1428,7 +1428,7 @@ def test_genome_ani_threshold(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert "WARNING: Please run gather with sourmash >= 4.3 to estimate query ANI at rank. Continuing without ANI..." not in c.last_result.err + assert "WARNING: Please run gather with sourmash >= 4.4 to estimate query ANI at rank. Continuing without ANI..." not in c.last_result.err assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,0.9328896594471843' in c.last_result.out @@ -1453,6 +1453,24 @@ def test_genome_ani_threshold(runtmp): assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0,0.9247805047263588" in c.last_result.out +def test_genome_ani_oldgather(runtmp): + c = runtmp + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', "0.95") + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "WARNING: Please run gather with sourmash >= 4.4 to estimate query ANI at rank. Continuing without ANI..." in c.last_result.err + assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,' in c.last_result.out + + def test_annotate_0(runtmp): # test annotate c = runtmp From cd4fbfcf7021bb5ff5d911d738d4e0d5dea8a40d Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Wed, 27 Apr 2022 15:46:56 -0700 Subject: [PATCH 5/9] dont warn about ignoring ANI more than once --- src/sourmash/tax/__main__.py | 13 +++++----- src/sourmash/tax/tax_utils.py | 2 +- tests/test_tax.py | 11 ++++++++ tests/test_tax_utils.py | 48 +++++++++++++++++------------------ 4 files changed, 43 insertions(+), 31 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 9ba282eac6..6bdf1b119f 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -99,7 +99,7 @@ def metagenome(args): seen_perfect = set() for rank in sourmash.lca.taxlist(include_strain=False): try: - summarized_gather[rank], seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, + summarized_gather[rank], seen_perfect, _ = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, seen_perfect = seen_perfect) @@ -179,9 +179,10 @@ def genome(args): sys.exit(-1) # if --rank is specified, classify to that rank + estimate_query_ani = True if args.rank: try: - best_at_rank, seen_perfect = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed, + best_at_rank, seen_perfect, estimate_query_ani = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, best_only=True, seen_perfect=seen_perfect, estimate_query_ani=True) @@ -215,10 +216,10 @@ def genome(args): for rank in tax_utils.ascending_taxlist(include_strain=False): # gets best_at_rank for all queries in this gather_csv try: - best_at_rank, seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions = args.keep_identifier_versions, - best_only=True, seen_perfect=seen_perfect, estimate_query_ani=True) + best_at_rank, seen_perfect, estimate_query_ani = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions = args.keep_identifier_versions, + best_only=True, seen_perfect=seen_perfect, estimate_query_ani=estimate_query_ani) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index d79a727405..e6aaff9e4b 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -311,7 +311,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) - return sum_uniq_to_query_sorted, seen_perfect + return sum_uniq_to_query_sorted, seen_perfect, estimate_query_ani def find_missing_identities(gather_results, tax_assign): diff --git a/tests/test_tax.py b/tests/test_tax.py index 3dcb2dbd1a..f900720798 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -1454,10 +1454,21 @@ def test_genome_ani_threshold(runtmp): def test_genome_ani_oldgather(runtmp): + # Ignore ANI if we don't have the information we need to estimate it c = runtmp g_csv = utils.get_test_data('tax/test1.gather.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,' in c.last_result.out + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--ani-threshold', "0.95") diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index e0c9001bfd..febae40076 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -302,7 +302,7 @@ def test_summarize_gather_at_0(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) # superkingdom assert len(sk_sum) == 1 @@ -317,7 +317,7 @@ def test_summarize_gather_at_0(): assert sk_sum[0].bp_match_at_rank == 100 # phylum - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) + phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res) print("phylum summarized gather: ", phy_sum[0]) assert len(phy_sum) == 1 assert phy_sum[0].query_name == "queryA" @@ -329,7 +329,7 @@ def test_summarize_gather_at_0(): assert phy_sum[0].f_weighted_at_rank == 1.0 assert phy_sum[0].bp_match_at_rank == 100 # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res) + cl_sum, _, _ = summarize_gather_at("class", taxD, g_res) assert len(cl_sum) == 2 print("class summarized gather: ", cl_sum) assert cl_sum[0].query_name == "queryA" @@ -365,7 +365,7 @@ def test_summarize_gather_at_1(): gB_tax = ("gB", "a;b;d") taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, estimate_query_ani=True) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res, estimate_query_ani=True) # superkingdom assert len(sk_sum) == 2 @@ -381,7 +381,7 @@ def test_summarize_gather_at_1(): assert sk_sum[1].query_ani_at_rank == None # phylum - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, estimate_query_ani=False) + phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, estimate_query_ani=False) print("phylum summarized gather 0: ", phy_sum[0]) print("phylum summarized gather 1: ", phy_sum[1]) assert len(phy_sum) == 2 @@ -395,7 +395,7 @@ def test_summarize_gather_at_1(): assert phy_sum[0].query_ani_at_rank == None assert phy_sum[1].query_ani_at_rank == None # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res, estimate_query_ani=True) + cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, estimate_query_ani=True) assert len(cl_sum) == 3 print("class summarized gather: ", cl_sum) assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -432,7 +432,7 @@ def test_summarize_gather_at_perfect_match(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) # superkingdom assert len(sk_sum) == 1 print("superkingdom summarized gather: ", sk_sum[0]) @@ -454,16 +454,16 @@ def test_summarize_gather_at_over100percent_f_unique_to_query(): # run summarize_gather_at and check results! with pytest.raises(ValueError) as exc: - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) assert "The tax summary of query 'queryA' is 1.1, which is > 100% of the query!!" in str(exc) # phylum with pytest.raises(ValueError) as exc: - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) + phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res) assert "The tax summary of query 'queryA' is 1.1, which is > 100% of the query!!" in str(exc) # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res) + cl_sum, _, _ = summarize_gather_at("class", taxD, g_res) assert len(cl_sum) == 2 print("class summarized gather: ", cl_sum) assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -491,7 +491,7 @@ def test_summarize_gather_at_missing_ignore(): taxD = make_mini_taxonomy([gA_tax]) # run summarize_gather_at and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, skip_idents=['gB']) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res, skip_idents=['gB']) # superkingdom assert len(sk_sum) == 2 print("superkingdom summarized gather: ", sk_sum[0]) @@ -503,7 +503,7 @@ def test_summarize_gather_at_missing_ignore(): assert sk_sum[1].bp_match_at_rank == 50 # phylum - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB']) + phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB']) print("phylum summarized gather: ", phy_sum[0]) assert len(phy_sum) == 2 assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) @@ -513,7 +513,7 @@ def test_summarize_gather_at_missing_ignore(): assert phy_sum[1].fraction == 0.5 assert phy_sum[1].bp_match_at_rank == 50 # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res, skip_idents=['gB']) + cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, skip_idents=['gB']) assert len(cl_sum) == 2 print("class summarized gather: ", cl_sum) assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -539,7 +539,7 @@ def test_summarize_gather_at_missing_fail(): # run summarize_gather_at and check results! with pytest.raises(ValueError) as exc: - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) assert "ident gB is not in the taxonomy database." in str(exc.value) @@ -557,7 +557,7 @@ def test_summarize_gather_at_best_only_0(): gB_tax = ("gB", "a;b;d") taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True,estimate_query_ani=True) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True,estimate_query_ani=True) # superkingdom assert len(sk_sum) == 1 print("superkingdom summarized gather: ", sk_sum[0]) @@ -568,7 +568,7 @@ def test_summarize_gather_at_best_only_0(): assert sk_sum[0].query_ani_at_rank == 0.9885602934376099 # phylum - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True,estimate_query_ani=True) + phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True,estimate_query_ani=True) print("phylum summarized gather: ", phy_sum[0]) assert len(phy_sum) == 1 assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) @@ -577,7 +577,7 @@ def test_summarize_gather_at_best_only_0(): print("phy ANI:",phy_sum[0].query_ani_at_rank) assert phy_sum[0].query_ani_at_rank == 0.9885602934376099 # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True, estimate_query_ani=True) + cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True, estimate_query_ani=True) assert len(cl_sum) == 1 print("class summarized gather: ", cl_sum) assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -603,7 +603,7 @@ def test_summarize_gather_at_best_only_equal_choose_first(): # run summarize_gather_at and check results! # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True) + cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True) assert len(cl_sum) == 1 print("class summarized gather: ", cl_sum) assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -697,7 +697,7 @@ def test_aggregate_by_lineage_at_rank_by_query(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # aggregate by lineage at rank - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) print("superkingdom summarized gather results:", sk_sum) assert len(sk_sum) ==4 assert sk_sum[0].query_name == "queryA" @@ -726,7 +726,7 @@ def test_aggregate_by_lineage_at_rank_by_query(): assert num_queries == 2 assert query_names == ['queryA', 'queryB'] - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) + phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res) print("phylum summary:", phy_sum, ']\n') phy_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(phy_sum, by_query=True) print("phylum lineage summary:", phy_lin_sum, '\n') @@ -750,13 +750,13 @@ def test_format_for_krona_0(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # check krona format and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) print("superkingdom summarized gather results:", sk_sum) krona_res = format_for_krona("superkingdom", {"superkingdom": sk_sum}) print("krona_res: ", krona_res) assert krona_res == [(1.0, 'a')] - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) + phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res) krona_res = format_for_krona("phylum", {"phylum": phy_sum}) print("krona_res: ", krona_res) assert krona_res == [(1.0, 'a', 'b')] @@ -778,7 +778,7 @@ def test_format_for_krona_1(): sum_res = {} #for rank in lca_utils.taxlist(include_strain=False): for rank in ['superkingdom', 'phylum', 'class']: - sum_res[rank], _ = summarize_gather_at(rank, taxD, g_res) + sum_res[rank], _, _ = summarize_gather_at(rank, taxD, g_res) print('summarized gather: ', sum_res) # check krona format sk_krona = format_for_krona("superkingdom", sum_res) @@ -808,7 +808,7 @@ def test_format_for_krona_best_only(): sum_res = {} #for rank in lca_utils.taxlist(include_strain=False): for rank in ['superkingdom', 'phylum', 'class']: - sum_res[rank], _ = summarize_gather_at(rank, taxD, g_res, best_only=True) + sum_res[rank], _, _ = summarize_gather_at(rank, taxD, g_res, best_only=True) print('summarized gather: ', sum_res) # check krona format sk_krona = format_for_krona("superkingdom", sum_res) From d27be4aeb1c57ad9077faa3ae52983a1632a79ad Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Fri, 29 Apr 2022 17:41:34 -0700 Subject: [PATCH 6/9] check query stats --- tests/test_search.py | 2 ++ tests/test_sourmash.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/test_search.py b/tests/test_search.py index 0d765e0f96..0efa267a20 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -482,6 +482,8 @@ def test_GatherResult(): assert res.match_md5 == ss4763.md5sum() assert res.query_n_hashes == len(ss47.minhash) assert res.match_n_hashes == len(ss4763.minhash) + assert res.query_bp == ss47.minhash.covered_bp + assert res.match_bp == ss4763.minhash.covered_bp assert res.md5 == ss4763.md5sum() assert res.name == ss4763.name assert res.match_filename == ss4763.filename diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 4dae2e4858..94cd6baaf9 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -5660,6 +5660,8 @@ def test_gather_ani_csv_estimate_ci(runtmp, linear_gather, prefetch_gather): assert row['query_name'] == 'tr1 4' assert row['query_md5'] == 'c9d5a795' assert row['query_bp'] == '910' + assert row['query_n_hashes'] == '91' + assert row['query_abundance'] == 'False' assert row['query_containment_ani']== '1.0' assert row['query_containment_ani_low']== '' assert row['query_containment_ani_high']== '' From fb3729ff18419d9cec2e99640273c46d7a41801e Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Sat, 23 Jul 2022 14:55:40 -0400 Subject: [PATCH 7/9] fix gather test --- tests/test_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_search.py b/tests/test_search.py index 7c9e43f1f4..1863c03bb2 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -481,8 +481,8 @@ def test_GatherResult(): assert res.match_md5 == ss4763.md5sum() assert res.query_n_hashes == len(ss47.minhash) assert res.match_n_hashes == len(ss4763.minhash) - assert res.query_bp == ss47.minhash.covered_bp - assert res.match_bp == ss4763.minhash.covered_bp + assert res.query_bp == ss47.minhash.unique_dataset_hashes + assert res.match_bp == ss4763.minhash.unique_dataset_hashes assert res.md5 == ss4763.md5sum() assert res.name == ss4763.name assert res.match_filename == ss4763.filename From f78eef3c249e9393ee5cefbfbe8a1f3f61ea7ab1 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Sat, 23 Jul 2022 20:12:10 -0400 Subject: [PATCH 8/9] add sentence on ani thresh --- doc/command-line.md | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index 99619da023..37438af488 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -468,10 +468,9 @@ The sourmash `tax` or `taxonomy` commands integrate taxonomic taxonomic rank. For example, if the gather results for a metagenome include results for 30 different strains of a given species, we can sum the fraction uniquely matched to each strain to obtain the fraction - uniquely matched to this species. Note that this summarization can - also take into account abundance weighting; see - [classifying signatures](classifying-signatures.md) for more - information. + uniquely matched to this species. Alternatively, taxonomic summarization + can take into account abundance weighting; see + [classifying signatures](classifying-signatures.md) for more information. As with all reference-based analysis, results can be affected by the completeness of the reference database. However, summarizing taxonomic @@ -589,12 +588,20 @@ To produce multiple output types from the same command, add the types into the ### `sourmash tax genome` - classify a genome using `gather` results `sourmash tax genome` reports likely classification for each query, - based on `gather` matches. By default, classification requires at least 10% of - the query to be matched. Thus, if 10% of the query was matched to a species, the + based on `gather` matches. Since `gather` matches are non-overlapping + + By default, classification requires at least 10% of the query to be matched. + Thus, if 10% of the query was matched to a species, the species-level classification can be reported. However, if 7% of the query was matched to one species, and an additional 5% matched to a different species in the same genus, the genus-level classification will be reported. +`sourmash tax genome` can use an ANI threshold (`--ani-threshold`) instead of a + containment threshold. This works the same way as the containment threshold + (and indeed, is using the same underlying information). Note that for DNA k-mers, + k=21 ANI is most similar to alignment-based ANI values, and ANI values should only + be compared if they were generated using the same ksize. + Optionally, `genome` can instead report classifications at a desired `rank`, regardless of match threshold (`--rank` argument, e.g. `--rank species`). From 6ee64463f239c54332653f646b907830e0caeade Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Sun, 24 Jul 2022 09:08:46 -0400 Subject: [PATCH 9/9] apply sugg from code review --- doc/command-line.md | 12 +++++------- src/sourmash/tax/tax_utils.py | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index 37438af488..3748f0f5c9 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -588,13 +588,11 @@ To produce multiple output types from the same command, add the types into the ### `sourmash tax genome` - classify a genome using `gather` results `sourmash tax genome` reports likely classification for each query, - based on `gather` matches. Since `gather` matches are non-overlapping - - By default, classification requires at least 10% of the query to be matched. - Thus, if 10% of the query was matched to a species, the - species-level classification can be reported. However, if 7% of the query was - matched to one species, and an additional 5% matched to a different species in - the same genus, the genus-level classification will be reported. + based on `gather` matches. By default, classification requires at least 10% + of the query to be matched. Thus, if 10% of the query was matched to a species, + the species-level classification can be reported. However, if 7% of the query + was matched to one species, and an additional 5% matched to a different species + in the same genus, the genus-level classification will be reported. `sourmash tax genome` can use an ANI threshold (`--ani-threshold`) instead of a containment threshold. This works the same way as the containment threshold diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index e6aaff9e4b..473a7fcce3 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -200,7 +200,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], sum_uniq_to_query = defaultdict(lambda: defaultdict(float)) sum_uniq_bp = defaultdict(lambda: defaultdict(float)) query_info = {} - ksize,scaled,query_nhashes=None,None,None + ksize, scaled, query_nhashes=None, None, None for row in gather_results: # get essential gather info @@ -283,7 +283,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], total_f_classified = 0.0 total_bp_classified = 0 for lineage, fraction in sumgather_items: - query_ani=None + query_ani = None if fraction > 1: raise ValueError(f"The tax summary of query '{query_name}' is {fraction}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") elif fraction == 0: