From 69edb6090fea99ea4a207e1829f1afbc2f8d6172 Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Tue, 12 Mar 2024 14:05:13 -0700 Subject: [PATCH] MRG: enable loading lineages from annotated gather with match_name instead of name (#3078) This PR enables loading from gather lineages files that contain 'match_name' instead of name. Rationale: - we generally plan to replace 'name' with 'match_name' in gather output (https://github.com/sourmash-bio/sourmash/issues/1555) - branchwater plugin's fastmultigather already uses 'match_name' --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- src/sourmash/tax/tax_utils.py | 15 ++++++++--- .../tax/test1.gather.with-lineages.csv | 5 ++++ tests/test_tax_utils.py | 25 ++++++++++++++++++- 3 files changed, 40 insertions(+), 5 deletions(-) create mode 100644 tests/test-data/tax/test1.gather.with-lineages.csv diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 55feed66d2..1615c90d74 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1274,7 +1274,9 @@ def load( elif "accession" in header: identifier = "accession" header = ["ident" if "accession" == x else x for x in header] - elif "name" in header and "lineage" in header: + elif "lineage" in header and any( + ["name" in header, "match_name" in header] + ): return cls.load_from_gather_with_lineages( filename, force=force, lins=lins, ictv=ictv ) @@ -1390,9 +1392,14 @@ def load_from_gather_with_lineages( if not header: raise ValueError(f"cannot read taxonomy assignments from {filename}") - if "name" not in header or "lineage" not in header: + ident_col = None + if "name" in header: + ident_col = "name" + elif "match_name" in header: + ident_col = "match_name" + if "lineage" not in header or ident_col is None: raise ValueError( - "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" + "Expected headers 'name'/'match_name' and 'lineage' not found. Is this a with-lineages file?" ) ranks = None @@ -1405,7 +1412,7 @@ def load_from_gather_with_lineages( for n, row in enumerate(r): num_rows += 1 - name = row["name"] + name = row[ident_col] ident = get_ident(name) if lins: diff --git a/tests/test-data/tax/test1.gather.with-lineages.csv b/tests/test-data/tax/test1.gather.with-lineages.csv new file mode 100644 index 0000000000..1c81221737 --- /dev/null +++ b/tests/test-data/tax/test1.gather.with-lineages.csv @@ -0,0 +1,5 @@ +intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,query_bp,ksize,scaled,query_n_hashes,lineage +442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Pseudomonadota;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Escherichia;Escherichia coli +390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Prevotellaceae;Prevotella;Prevotella copri +138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Bacteroidaceae;Phocaeicola;Phocaeicola vulgatus +338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Prevotellaceae;Prevotella;Prevotella copri diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 192406e251..dfca20628a 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1014,7 +1014,7 @@ def test_check_and_load_gather_lineage_csvs_bad_header(runtmp): with pytest.raises(ValueError) as exc: LineageDB.load_from_gather_with_lineages(g_res) assert ( - "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" + "Expected headers 'name'/'match_name' and 'lineage' not found. Is this a with-lineages file?" in str(exc.value) ) @@ -1038,6 +1038,29 @@ def test_check_and_load_gather_lineage_csvs_isdir(runtmp): assert "is a directory" in str(exc.value) +def test_check_and_load_gather_lineage_csvs_name(runtmp): + # test loading a with-lineage file that has 'name', not 'match_name' + g_res = utils.get_test_data("tax/test1.gather.with-lineages.csv") + + lins = LineageDB.load_from_gather_with_lineages(g_res) + assert len(lins) == 4 + + +def test_check_and_load_gather_lineage_csvs_match_name(runtmp): + # test loading a with-lineage file that has 'match_name' instead of 'name' + g_res = utils.get_test_data("tax/test1.gather.with-lineages.csv") + out_lins = runtmp.output("match-name.lineages.csv") + with open(g_res) as f_in: + first_line = f_in.readline().replace("name", "match_name") + with open(out_lins, "w") as f_out: + f_out.write(first_line) + for line in f_in: + f_out.write(line) + + lins = LineageDB.load_from_gather_with_lineages(out_lins) + assert len(lins) == 4 + + def test_check_and_load_gather_csvs_fail_on_missing(runtmp): g_csv = utils.get_test_data("tax/test1.gather.csv") # make gather results with taxonomy name not in tax_assign