diff --git a/doc/command-line.md b/doc/command-line.md index 99619da023..3748f0f5c9 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -468,10 +468,9 @@ The sourmash `tax` or `taxonomy` commands integrate taxonomic taxonomic rank. For example, if the gather results for a metagenome include results for 30 different strains of a given species, we can sum the fraction uniquely matched to each strain to obtain the fraction - uniquely matched to this species. Note that this summarization can - also take into account abundance weighting; see - [classifying signatures](classifying-signatures.md) for more - information. + uniquely matched to this species. Alternatively, taxonomic summarization + can take into account abundance weighting; see + [classifying signatures](classifying-signatures.md) for more information. As with all reference-based analysis, results can be affected by the completeness of the reference database. However, summarizing taxonomic @@ -589,11 +588,17 @@ To produce multiple output types from the same command, add the types into the ### `sourmash tax genome` - classify a genome using `gather` results `sourmash tax genome` reports likely classification for each query, - based on `gather` matches. By default, classification requires at least 10% of - the query to be matched. Thus, if 10% of the query was matched to a species, the - species-level classification can be reported. However, if 7% of the query was - matched to one species, and an additional 5% matched to a different species in - the same genus, the genus-level classification will be reported. + based on `gather` matches. By default, classification requires at least 10% + of the query to be matched. Thus, if 10% of the query was matched to a species, + the species-level classification can be reported. However, if 7% of the query + was matched to one species, and an additional 5% matched to a different species + in the same genus, the genus-level classification will be reported. + +`sourmash tax genome` can use an ANI threshold (`--ani-threshold`) instead of a + containment threshold. This works the same way as the containment threshold + (and indeed, is using the same underlying information). Note that for DNA k-mers, + k=21 ANI is most similar to alignment-based ANI values, and ANI values should only + be compared if they were generated using the same ksize. Optionally, `genome` can instead report classifications at a desired `rank`, regardless of match threshold (`--rank` argument, e.g. `--rank species`). diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 1725518747..2063fd09de 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -67,10 +67,14 @@ def range_limited_float_type(arg): return f -def add_tax_threshold_arg(parser, default=0.1): +def add_tax_threshold_arg(parser, containment_default=0.1, ani_default=None): parser.add_argument( - '--containment-threshold', default=default, type=range_limited_float_type, - help=f'minimum containment threshold for classification; default={default}' + '--containment-threshold', default=containment_default, type=range_limited_float_type, + help=f'minimum containment threshold for classification; default={containment_default}', + ) + parser.add_argument( + '--ani-threshold', '--aai-threshold', default=ani_default, type=range_limited_float_type, + help=f'minimum ANI threshold (nucleotide gather) or AAI threshold (protein gather) for classification; default={ani_default}', ) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 01fc6bbe1c..6bdf1b119f 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -99,7 +99,7 @@ def metagenome(args): seen_perfect = set() for rank in sourmash.lca.taxlist(include_strain=False): try: - summarized_gather[rank], seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, + summarized_gather[rank], seen_perfect, _ = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, seen_perfect = seen_perfect) @@ -179,12 +179,14 @@ def genome(args): sys.exit(-1) # if --rank is specified, classify to that rank + estimate_query_ani = True if args.rank: try: - best_at_rank, seen_perfect = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed, + best_at_rank, seen_perfect, estimate_query_ani = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, - best_only=True, seen_perfect=seen_perfect) + best_only=True, seen_perfect=seen_perfect, estimate_query_ani=True) + except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -194,27 +196,30 @@ def genome(args): status = 'nomatch' if sg.query_name in matched_queries: continue - if sg.fraction <= args.containment_threshold: + if args.ani_threshold and sg.query_ani_at_rank < args.ani_threshold: + status="below_threshold" + notify(f"WARNING: classifying query {sg.query_name} at desired rank {args.rank} does not meet query ANI/AAI threshold {args.ani_threshold}") + elif sg.fraction <= args.containment_threshold: # should this just be less than? status="below_threshold" notify(f"WARNING: classifying query {sg.query_name} at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") else: status="match" - classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank) + classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank, sg.query_ani_at_rank) classifications[args.rank].append(classif) matched_queries.add(sg.query_name) if "krona" in args.output_format: lin_list = display_lineage(sg.lineage).split(';') krona_results.append((sg.fraction, *lin_list)) else: - # classify to the match that passes the containment threshold. + # classify to the rank/match that passes the containment threshold. # To do - do we want to store anything for this match if nothing >= containment threshold? for rank in tax_utils.ascending_taxlist(include_strain=False): # gets best_at_rank for all queries in this gather_csv try: - best_at_rank, seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions = args.keep_identifier_versions, - best_only=True, seen_perfect=seen_perfect) + best_at_rank, seen_perfect, estimate_query_ani = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions = args.keep_identifier_versions, + best_only=True, seen_perfect=seen_perfect, estimate_query_ani=estimate_query_ani) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -223,18 +228,26 @@ def genome(args): status = 'nomatch' if sg.query_name in matched_queries: continue - if sg.fraction >= args.containment_threshold: + if sg.query_ani_at_rank is not None and args.ani_threshold and sg.query_ani_at_rank >= args.ani_threshold: + status="match" + elif sg.fraction >= args.containment_threshold: status = "match" - classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank) + if status == "match": + classif = ClassificationResult(query_name=sg.query_name, status=status, rank=sg.rank, + fraction=sg.fraction, lineage=sg.lineage, + query_md5=sg.query_md5, query_filename=sg.query_filename, + f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank, + query_ani_at_rank= sg.query_ani_at_rank) classifications[sg.rank].append(classif) matched_queries.add(sg.query_name) continue - if rank == "superkingdom" and status == "nomatch": + elif rank == "superkingdom" and status == "nomatch": status="below_threshold" classif = ClassificationResult(query_name=sg.query_name, status=status, rank="", fraction=0, lineage="", query_md5=sg.query_md5, query_filename=sg.query_filename, - f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank) + f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank, + query_ani_at_rank=sg.query_ani_at_rank) classifications[sg.rank].append(classif) if not any([classifications, krona_results]): diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index d5a7161afc..473a7fcce3 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -8,6 +8,7 @@ from sourmash import sqlite_utils from sourmash.exceptions import IndexNotSupported +from sourmash.distance_utils import containment_to_distance import sqlite3 @@ -24,9 +25,9 @@ from sourmash.logging import notify from sourmash.sourmash_args import load_pathlist_from_file -QueryInfo = namedtuple("QueryInfo", "query_md5, query_filename, query_bp") -SummarizedGatherResult = namedtuple("SummarizedGatherResult", "query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank") -ClassificationResult = namedtuple("ClassificationResult", "query_name, status, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank") +QueryInfo = namedtuple("QueryInfo", "query_md5, query_filename, query_bp, query_hashes") +SummarizedGatherResult = namedtuple("SummarizedGatherResult", "query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank") +ClassificationResult = namedtuple("ClassificationResult", "query_name, status, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank") # Essential Gather column names that must be in gather_csv to allow `tax` summarization EssentialGatherColnames = ('query_name', 'name', 'f_unique_weighted', 'f_unique_to_query', 'unique_intersect_bp', 'remaining_bp', 'query_md5', 'query_filename') @@ -188,7 +189,8 @@ def find_match_lineage(match_ident, tax_assign, *, skip_idents = [], def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], keep_full_identifiers=False, keep_identifier_versions=False, best_only=False, - seen_perfect=set()): + seen_perfect=set(), + estimate_query_ani=False): """ Summarize gather results at specified taxonomic rank """ @@ -198,6 +200,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], sum_uniq_to_query = defaultdict(lambda: defaultdict(float)) sum_uniq_bp = defaultdict(lambda: defaultdict(float)) query_info = {} + ksize, scaled, query_nhashes=None, None, None for row in gather_results: # get essential gather info @@ -208,13 +211,27 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], query_md5 = row['query_md5'] query_filename = row['query_filename'] # get query_bp - if query_name not in query_info.keys(): - query_bp = unique_intersect_bp + int(row['remaining_bp']) + if query_name not in query_info.keys(): #REMOVING THIS AFFECTS GATHER RESULTS!!! BUT query bp should always be same for same query? bug? + if "query_nhashes" in row.keys(): + query_nhashes = int(row["query_nhashes"]) + if "query_bp" in row.keys(): + query_bp = int(row["query_bp"]) + else: + query_bp = unique_intersect_bp + int(row['remaining_bp']) + # store query info - query_info[query_name] = QueryInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp) + query_info[query_name] = QueryInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp, query_hashes = query_nhashes) + + if estimate_query_ani and (not ksize or not scaled): # just need to set these once. BUT, if we have these, should we check for compatibility when loading the gather file? + if "ksize" in row.keys(): + ksize = int(row['ksize']) + scaled = int(row['scaled']) + else: + estimate_query_ani=False + notify("WARNING: Please run gather with sourmash >= 4.4 to estimate query ANI at rank. Continuing without ANI...") + match_ident = row['name'] - # 100% match? are we looking at something in the database? if f_unique_to_query >= 1.0 and query_name not in seen_perfect: # only want to notify once, not for each rank ident = get_ident(match_ident, @@ -225,16 +242,16 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], # get lineage for match lineage = find_match_lineage(match_ident, tax_assign, - skip_idents=skip_idents, - keep_full_identifiers=keep_full_identifiers, - keep_identifier_versions=keep_identifier_versions) + skip_idents=skip_idents, + keep_full_identifiers=keep_full_identifiers, + keep_identifier_versions=keep_identifier_versions) # ident was in skip_idents if not lineage: continue # summarize at rank! lineage = pop_to_rank(lineage, rank) - assert lineage[-1].rank == rank, (rank, lineage[-1]) + assert lineage[-1].rank == rank, lineage[-1] # record info sum_uniq_to_query[query_name][lineage] += f_unique_to_query sum_uniq_weighted[query_name][lineage] += f_uniq_weighted @@ -246,6 +263,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], qInfo = query_info[query_name] sumgather_items = list(lineage_weights.items()) sumgather_items.sort(key = lambda x: -x[1]) + query_ani = None if best_only: lineage, fraction = sumgather_items[0] if fraction > 1: @@ -254,13 +272,18 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], continue f_weighted_at_rank = sum_uniq_weighted[query_name][lineage] bp_intersect_at_rank = sum_uniq_bp[query_name][lineage] - sres = SummarizedGatherResult(query_name, rank, fraction, lineage, qInfo.query_md5, qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank) + if estimate_query_ani: + query_ani = containment_to_distance(fraction, ksize, scaled, + n_unique_kmers= qInfo.query_hashes, sequence_len_bp= qInfo.query_bp).ani + sres = SummarizedGatherResult(query_name, rank, fraction, lineage, qInfo.query_md5, + qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) else: total_f_weighted= 0.0 total_f_classified = 0.0 total_bp_classified = 0 for lineage, fraction in sumgather_items: + query_ani = None if fraction > 1: raise ValueError(f"The tax summary of query '{query_name}' is {fraction}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") elif fraction == 0: @@ -270,19 +293,25 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], total_f_weighted += f_weighted_at_rank bp_intersect_at_rank = int(sum_uniq_bp[query_name][lineage]) total_bp_classified += bp_intersect_at_rank - sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_intersect_at_rank) + if estimate_query_ani: + query_ani = containment_to_distance(fraction, ksize, scaled, + n_unique_kmers=qInfo.query_hashes, sequence_len_bp=qInfo.query_bp).ani + sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, + query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) # record unclassified lineage = () + query_ani = None fraction = 1.0 - total_f_classified if fraction > 0: f_weighted_at_rank = 1.0 - total_f_weighted bp_intersect_at_rank = qInfo.query_bp - total_bp_classified - sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_intersect_at_rank) + sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, + query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) - return sum_uniq_to_query_sorted, seen_perfect + return sum_uniq_to_query_sorted, seen_perfect, estimate_query_ani def find_missing_identities(gather_results, tax_assign): diff --git a/tests/test-data/tax/test1.gather_ani.csv b/tests/test-data/tax/test1.gather_ani.csv new file mode 100644 index 0000000000..48a09eb199 --- /dev/null +++ b/tests/test-data/tax/test1.gather_ani.csv @@ -0,0 +1,5 @@ +intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,ksize,scaled,query_nhashes +442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,31,1000,5013970 +390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,31,1000,4571970 +138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,31,1000,4181970 +338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,31,1000,4327970 diff --git a/tests/test_search.py b/tests/test_search.py index 55fbf4a73a..1863c03bb2 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -481,6 +481,8 @@ def test_GatherResult(): assert res.match_md5 == ss4763.md5sum() assert res.query_n_hashes == len(ss47.minhash) assert res.match_n_hashes == len(ss4763.minhash) + assert res.query_bp == ss47.minhash.unique_dataset_hashes + assert res.match_bp == ss4763.minhash.unique_dataset_hashes assert res.md5 == ss4763.md5sum() assert res.name == ss4763.name assert res.match_filename == ss4763.filename diff --git a/tests/test_tax.py b/tests/test_tax.py index 1faf6ce19a..f900720798 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -705,6 +705,18 @@ def test_genome_rank_stdout_0_db(runtmp): assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0' in c.last_result.out + # too stringent of containment threshold: + c.run_sourmash('tax', 'genome', '--gather-csv', g_csv, '--taxonomy-csv', + tax, '--rank', 'species', '--containment-threshold', '1.0') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "WARNING: classifying query test1 at desired rank species does not meet containment threshold 1.0" in c.last_result.err + assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0," in c.last_result.out + def test_genome_rank_csv_0(runtmp): # test basic genome - output csv @@ -1366,6 +1378,110 @@ def test_genome_over100percent_error(runtmp): assert "ERROR: The tax summary of query 'test1' is 1.1, which is > 100% of the query!!" in runtmp.last_result.err +def test_genome_ani_threshold_input_errors(runtmp): + c = runtmp + g_csv = utils.get_test_data('tax/test1.gather_ani.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + below_threshold = "-1" + + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('tax', 'genome', '-g', tax, '--taxonomy-csv', tax, + '--ani-threshold', below_threshold) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "ERROR: Argument must be >0 and <1" in str(exc.value) + + above_threshold = "1.1" + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', above_threshold) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "ERROR: Argument must be >0 and <1" in str(exc.value) + + not_a_float = "str" + + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', not_a_float) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "ERROR: Must be a floating point number" in str(exc.value) + + +def test_genome_ani_threshold(runtmp): + c = runtmp + g_csv = utils.get_test_data('tax/test1.gather_ani.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', "0.95") + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "WARNING: Please run gather with sourmash >= 4.4 to estimate query ANI at rank. Continuing without ANI..." not in c.last_result.err + assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,0.9328896594471843' in c.last_result.out + + # more lax threshold + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', "0.9") + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0' in c.last_result.out + + # too stringent of threshold (using rank) + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', "1.0", '--rank', 'species') + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "WARNING: classifying query test1 at desired rank species does not meet query ANI/AAI threshold 1.0" in c.last_result.err + assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0,0.9247805047263588" in c.last_result.out + + +def test_genome_ani_oldgather(runtmp): + # Ignore ANI if we don't have the information we need to estimate it + c = runtmp + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,' in c.last_result.out + + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', "0.95") + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "WARNING: Please run gather with sourmash >= 4.4 to estimate query ANI at rank. Continuing without ANI..." in c.last_result.err + assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,' in c.last_result.out + + def test_annotate_0(runtmp): # test annotate c = runtmp diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 449e26f972..febae40076 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1,6 +1,7 @@ """ Tests for functions in taxonomy submodule. """ + import pytest from os.path import basename @@ -22,9 +23,11 @@ from sourmash.lca.lca_utils import LineagePair # utility functions for testing -def make_mini_gather_results(g_infolist): +def make_mini_gather_results(g_infolist, include_ksize_and_scaled=False): # make mini gather_results min_header = ["query_name", "name", "match_ident", "f_unique_to_query", "query_md5", "query_filename", "f_unique_weighted", "unique_intersect_bp", "remaining_bp"] + if include_ksize_and_scaled: + min_header.extend(['ksize', 'scaled']) gather_results = [] for g_info in g_infolist: inf = dict(zip(min_header, g_info)) @@ -299,7 +302,7 @@ def test_summarize_gather_at_0(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) # superkingdom assert len(sk_sum) == 1 @@ -314,7 +317,7 @@ def test_summarize_gather_at_0(): assert sk_sum[0].bp_match_at_rank == 100 # phylum - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) + phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res) print("phylum summarized gather: ", phy_sum[0]) assert len(phy_sum) == 1 assert phy_sum[0].query_name == "queryA" @@ -326,7 +329,7 @@ def test_summarize_gather_at_0(): assert phy_sum[0].f_weighted_at_rank == 1.0 assert phy_sum[0].bp_match_at_rank == 100 # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res) + cl_sum, _, _ = summarize_gather_at("class", taxD, g_res) assert len(cl_sum) == 2 print("class summarized gather: ", cl_sum) assert cl_sum[0].query_name == "queryA" @@ -351,30 +354,36 @@ def test_summarize_gather_at_0(): def test_summarize_gather_at_1(): """test two matches, diff f_unique_to_query""" # make mini gather_results - gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40'] - gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.1', '10', '90'] - g_res = make_mini_gather_results([gA,gB]) + ksize=31 + scaled=10 + gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40', ksize, scaled] + gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.1', '10', '90', ksize, scaled] + g_res = make_mini_gather_results([gA,gB], include_ksize_and_scaled=True) # make mini taxonomy gA_tax = ("gA", "a;b;c") gB_tax = ("gB", "a;b;d") taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res, estimate_query_ani=True) # superkingdom assert len(sk_sum) == 2 - print("superkingdom summarized gather: ", sk_sum[0]) + print("\nsuperkingdom summarized gather 0: ", sk_sum[0]) assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 0.7 assert sk_sum[0].bp_match_at_rank == 70 + print("superkingdom summarized gather 1: ", sk_sum[1]) assert sk_sum[1].lineage == () assert round(sk_sum[1].fraction, 1) == 0.3 assert sk_sum[1].bp_match_at_rank == 30 + assert sk_sum[0].query_ani_at_rank == 0.9885602934376099 + assert sk_sum[1].query_ani_at_rank == None # phylum - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) - print("phylum summarized gather: ", phy_sum[0]) + phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, estimate_query_ani=False) + print("phylum summarized gather 0: ", phy_sum[0]) + print("phylum summarized gather 1: ", phy_sum[1]) assert len(phy_sum) == 2 assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) assert phy_sum[0].fraction == 0.7 @@ -383,8 +392,10 @@ def test_summarize_gather_at_1(): assert phy_sum[1].lineage == () assert round(phy_sum[1].fraction, 1) == 0.3 assert phy_sum[1].bp_match_at_rank == 30 + assert phy_sum[0].query_ani_at_rank == None + assert phy_sum[1].query_ani_at_rank == None # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res) + cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, estimate_query_ani=True) assert len(cl_sum) == 3 print("class summarized gather: ", cl_sum) assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -393,6 +404,7 @@ def test_summarize_gather_at_1(): assert cl_sum[0].fraction == 0.6 assert cl_sum[0].f_weighted_at_rank == 0.5 assert cl_sum[0].bp_match_at_rank == 60 + assert cl_sum[0].query_ani_at_rank == 0.9836567776983505 assert cl_sum[1].rank == 'class' assert cl_sum[1].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -401,8 +413,10 @@ def test_summarize_gather_at_1(): assert cl_sum[1].fraction == 0.1 assert cl_sum[1].f_weighted_at_rank == 0.1 assert cl_sum[1].bp_match_at_rank == 10 + assert cl_sum[1].query_ani_at_rank == 0.9284145445194744 assert cl_sum[2].lineage == () assert round(cl_sum[2].fraction, 1) == 0.3 + assert cl_sum[2].query_ani_at_rank == None def test_summarize_gather_at_perfect_match(): @@ -418,7 +432,7 @@ def test_summarize_gather_at_perfect_match(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) # superkingdom assert len(sk_sum) == 1 print("superkingdom summarized gather: ", sk_sum[0]) @@ -440,16 +454,16 @@ def test_summarize_gather_at_over100percent_f_unique_to_query(): # run summarize_gather_at and check results! with pytest.raises(ValueError) as exc: - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) assert "The tax summary of query 'queryA' is 1.1, which is > 100% of the query!!" in str(exc) # phylum with pytest.raises(ValueError) as exc: - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) + phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res) assert "The tax summary of query 'queryA' is 1.1, which is > 100% of the query!!" in str(exc) # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res) + cl_sum, _, _ = summarize_gather_at("class", taxD, g_res) assert len(cl_sum) == 2 print("class summarized gather: ", cl_sum) assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -477,7 +491,7 @@ def test_summarize_gather_at_missing_ignore(): taxD = make_mini_taxonomy([gA_tax]) # run summarize_gather_at and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, skip_idents=['gB']) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res, skip_idents=['gB']) # superkingdom assert len(sk_sum) == 2 print("superkingdom summarized gather: ", sk_sum[0]) @@ -489,7 +503,7 @@ def test_summarize_gather_at_missing_ignore(): assert sk_sum[1].bp_match_at_rank == 50 # phylum - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB']) + phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB']) print("phylum summarized gather: ", phy_sum[0]) assert len(phy_sum) == 2 assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) @@ -499,7 +513,7 @@ def test_summarize_gather_at_missing_ignore(): assert phy_sum[1].fraction == 0.5 assert phy_sum[1].bp_match_at_rank == 50 # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res, skip_idents=['gB']) + cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, skip_idents=['gB']) assert len(cl_sum) == 2 print("class summarized gather: ", cl_sum) assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -525,39 +539,45 @@ def test_summarize_gather_at_missing_fail(): # run summarize_gather_at and check results! with pytest.raises(ValueError) as exc: - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) assert "ident gB is not in the taxonomy database." in str(exc.value) def test_summarize_gather_at_best_only_0(): """test two matches, diff f_unique_to_query""" # make mini gather_results - gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40'] - gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.5', '10', '90'] - g_res = make_mini_gather_results([gA,gB]) + ksize =31 + scaled=10 + gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40', ksize, scaled] + gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.5', '10', '90', ksize, scaled] + g_res = make_mini_gather_results([gA,gB],include_ksize_and_scaled=True) # make mini taxonomy gA_tax = ("gA", "a;b;c") gB_tax = ("gB", "a;b;d") taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True,estimate_query_ani=True) # superkingdom assert len(sk_sum) == 1 print("superkingdom summarized gather: ", sk_sum[0]) assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 0.7 assert sk_sum[0].bp_match_at_rank == 70 + print("superk ANI:",sk_sum[0].query_ani_at_rank) + assert sk_sum[0].query_ani_at_rank == 0.9885602934376099 # phylum - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True) + phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True,estimate_query_ani=True) print("phylum summarized gather: ", phy_sum[0]) assert len(phy_sum) == 1 assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) assert phy_sum[0].fraction == 0.7 assert phy_sum[0].bp_match_at_rank == 70 + print("phy ANI:",phy_sum[0].query_ani_at_rank) + assert phy_sum[0].query_ani_at_rank == 0.9885602934376099 # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True) + cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True, estimate_query_ani=True) assert len(cl_sum) == 1 print("class summarized gather: ", cl_sum) assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -565,6 +585,8 @@ def test_summarize_gather_at_best_only_0(): LineagePair(rank='class', name='c')) assert cl_sum[0].fraction == 0.6 assert cl_sum[0].bp_match_at_rank == 60 + print("cl ANI:",cl_sum[0].query_ani_at_rank) + assert cl_sum[0].query_ani_at_rank == 0.9836567776983505 def test_summarize_gather_at_best_only_equal_choose_first(): @@ -581,7 +603,7 @@ def test_summarize_gather_at_best_only_equal_choose_first(): # run summarize_gather_at and check results! # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True) + cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True) assert len(cl_sum) == 1 print("class summarized gather: ", cl_sum) assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -597,12 +619,14 @@ def test_write_summary_csv(runtmp): sum_gather = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.0, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')))]} + LineagePair(rank='phylum', name='b')), + query_ani_at_rank=None)]} outs= runtmp.output("outsum.csv") with open(outs, 'w') as out_fp: @@ -610,9 +634,9 @@ def test_write_summary_csv(runtmp): sr = [x.rstrip().split(',') for x in open(outs, 'r')] print("gather_summary_results_from_file: \n", sr) - assert ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank'] == sr[0] - assert ['queryA', 'superkingdom', '1.0', 'a', 'queryA_md5', 'queryA.sig', '1.0', '100'] == sr[1] - assert ['queryA', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100'] == sr[2] + assert ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank'] == sr[0] + assert ['queryA', 'superkingdom', '1.0', 'a', 'queryA_md5', 'queryA.sig', '1.0', '100', ''] == sr[1] + assert ['queryA', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100',''] == sr[2] def test_write_classification(runtmp): @@ -620,7 +644,8 @@ def test_write_classification(runtmp): classif = ClassificationResult('queryA', 'match', 'phylum', 1.0, (LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')), - 'queryA_md5', 'queryA.sig', 1.0, 100) + 'queryA_md5', 'queryA.sig', 1.0, 100, + query_ani_at_rank=None) classification = {'phylum': [classif]} @@ -630,8 +655,8 @@ def test_write_classification(runtmp): sr = [x.rstrip().split(',') for x in open(outs, 'r')] print("gather_classification_results_from_file: \n", sr) - assert ['query_name', 'status', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank'] == sr[0] - assert ['queryA', 'match', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100'] == sr[1] + assert ['query_name', 'status', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank'] == sr[0] + assert ['queryA', 'match', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100', ''] == sr[1] def test_make_krona_header_0(): @@ -672,7 +697,7 @@ def test_aggregate_by_lineage_at_rank_by_query(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # aggregate by lineage at rank - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) print("superkingdom summarized gather results:", sk_sum) assert len(sk_sum) ==4 assert sk_sum[0].query_name == "queryA" @@ -701,7 +726,7 @@ def test_aggregate_by_lineage_at_rank_by_query(): assert num_queries == 2 assert query_names == ['queryA', 'queryB'] - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) + phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res) print("phylum summary:", phy_sum, ']\n') phy_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(phy_sum, by_query=True) print("phylum lineage summary:", phy_lin_sum, '\n') @@ -725,13 +750,13 @@ def test_format_for_krona_0(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # check krona format and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) print("superkingdom summarized gather results:", sk_sum) krona_res = format_for_krona("superkingdom", {"superkingdom": sk_sum}) print("krona_res: ", krona_res) assert krona_res == [(1.0, 'a')] - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) + phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res) krona_res = format_for_krona("phylum", {"phylum": phy_sum}) print("krona_res: ", krona_res) assert krona_res == [(1.0, 'a', 'b')] @@ -753,7 +778,7 @@ def test_format_for_krona_1(): sum_res = {} #for rank in lca_utils.taxlist(include_strain=False): for rank in ['superkingdom', 'phylum', 'class']: - sum_res[rank], _ = summarize_gather_at(rank, taxD, g_res) + sum_res[rank], _, _ = summarize_gather_at(rank, taxD, g_res) print('summarized gather: ', sum_res) # check krona format sk_krona = format_for_krona("superkingdom", sum_res) @@ -783,7 +808,7 @@ def test_format_for_krona_best_only(): sum_res = {} #for rank in lca_utils.taxlist(include_strain=False): for rank in ['superkingdom', 'phylum', 'class']: - sum_res[rank], _ = summarize_gather_at(rank, taxD, g_res, best_only=True) + sum_res[rank], _, _ = summarize_gather_at(rank, taxD, g_res, best_only=True) print('summarized gather: ', sum_res) # check krona format sk_krona = format_for_krona("superkingdom", sum_res) @@ -816,21 +841,25 @@ def test_combine_sumgather_csvs_by_lineage(runtmp): sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')))]} + LineagePair(rank='phylum', name='b')), + query_ani_at_rank=None)]} sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='c')))]} + LineagePair(rank='phylum', name='c')), + query_ani_at_rank=None)]} # write summarized gather results csvs sg1= runtmp.output("sample1.csv") @@ -903,21 +932,25 @@ def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp): sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')))]} + LineagePair(rank='phylum', name='b')), + query_ani_at_rank=None)]} sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='c')))]} + LineagePair(rank='phylum', name='c')), + query_ani_at_rank=None)]} # write summarized gather results csvs sg1= runtmp.output("sample1.csv")