Skip to content

Commit

Permalink
MRG: fix remaining_bp output from sourmash gather (#3195)
Browse files Browse the repository at this point in the history
Adjusts `remaining_bp` from `sourmash gather` to take into account the
original size of the query sketch, not just the identified ones. Adds a
specific test, and updates documentation to provide a bit more detail,
too.

Fixes #3194

TODO:
- [x] actually fixme :)

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
ctb and pre-commit-ci[bot] authored Jun 10, 2024
1 parent 9045218 commit 6e84ecf
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 3 deletions.
4 changes: 2 additions & 2 deletions doc/classifying-signatures.md
Original file line number Diff line number Diff line change
Expand Up @@ -556,15 +556,15 @@ Note that order of columns is not guaranteed and may change between versions.
| `md5` | string | Full md5sum of the match sketch. |
| `f_match_orig` | float | The fraction of the match in the full query. Rank independent. |
| `gather_result_rank` | float | Rank of this match in the results. |
| `remaining_bp` | integer | How many bp remain in the query after subtracting this match, estimated by multiplying remaining hashes by scaled. |
| `remaining_bp` | integer | How many bp remain in the query after subtracting this match, estimated by multiplying remaining hashes by scaled. Starts at `query_n_hashes`. Unweighted. See `sum_weighted_found` for the related weighted value. |
| `query_filename` | string | The filename from which the query was loaded. |
| `query_name` | string | The query sketch name. |
| `query_md5` | string | Truncated md5sum of the query sketch. |
| `query_bp` | integer | Estimated number of bp in the query, estimated by multiplying the sketch size by scaled. |
| `ksize` | integer | K-mer size for the sketches used in the comparison. |
| `moltype` | string | Molecule type of the comparison. |
| `scaled` | integer | Scaled value of the comparison. |
| `query_n_hashes` | integer | Number of hashes in the query sketch. |
| `query_n_hashes` | integer | Number of hashes in the query sketch. Unweighted. See `total_weighted_hashes` for weighted value. |
| `query_abundance` | boolean | True if the query has abundance information; False otherwise. |
| `query_containment_ani` | float | ANI estimated from the query containment in the match. |
| `match_containment_ani` | float | ANI estimated from the match containment in the query. |
Expand Down
5 changes: 4 additions & 1 deletion src/sourmash/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,7 @@ class GatherResult(PrefetchResult):
orig_query_abunds: list = None
sum_weighted_found: int = None
total_weighted_hashes: int = None
noident_len: int = 0

gather_write_cols = [
"intersect_bp",
Expand Down Expand Up @@ -589,7 +590,8 @@ def build_gather_result(self):

# here, need to make sure to use the mh1_cmp (bc was downsampled to cmp_scaled)
self.remaining_bp = (
self.gather_comparison.mh1_cmp.unique_dataset_hashes
self.noident_len
+ self.gather_comparison.mh1_cmp.unique_dataset_hashes
- self.gather_comparison.total_unique_intersect_hashes
)

Expand Down Expand Up @@ -937,6 +939,7 @@ def __next__(self):
estimate_ani_ci=self.estimate_ani_ci,
sum_weighted_found=sum_weighted_found,
total_weighted_hashes=total_weighted_hashes,
noident_len=len(self.noident_mh) * self.noident_mh.scaled,
)

self.result_n += 1
Expand Down
71 changes: 71 additions & 0 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -4539,6 +4539,77 @@ def test_gather_abund_nomatch(runtmp, linear_gather, prefetch_gather):
assert "No matches found for --threshold-bp at 50.0 kbp." in runtmp.last_result.err


def test_gather_metagenome_3_thermo(runtmp):
# test gather matches in more detail.
match1 = "gather/GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig"
match2 = "gather/GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig"
match3 = "gather/GCF_000008545.1_ASM854v1_genomic.fna.gz.sig"

match1 = utils.get_test_data(match1)
match2 = utils.get_test_data(match2)
match3 = utils.get_test_data(match3)

query_sig = utils.get_test_data("gather/combined.sig")

runtmp.sourmash(
"gather",
query_sig,
match1,
match2,
match3,
"-k",
"21",
"--threshold-bp=0",
"-o",
"match3.csv",
)

print(runtmp.last_result.out)
print(runtmp.last_result.err)

outfile = runtmp.output("match3.csv")
with sourmash_args.FileInputCSV(outfile) as r:
rows = list(r)

assert len(rows) == 3

# first row
row = rows[0]
assert row["name"].startswith("NC_000853.1 ")
f_match = float(row["f_match"])
f_unique_to_query = round(float(row["f_unique_to_query"]), 5)
unique_intersect_bp = int(row["unique_intersect_bp"])
remaining_bp = int(row["remaining_bp"])
assert f_match == 1.0
assert f_unique_to_query == round(0.13096862, 5)
assert unique_intersect_bp == 1920000
assert remaining_bp == 12740000

# second row
row = rows[1]
assert row["name"].startswith("NC_011978.1 ")
f_match = float(row["f_match"])
f_unique_to_query = round(float(row["f_unique_to_query"]), 5)
unique_intersect_bp = int(row["unique_intersect_bp"])
remaining_bp = int(row["remaining_bp"])
assert round(f_match, 5) == round(0.898936170212766, 5)
assert f_unique_to_query == round(0.115279, 5)
assert unique_intersect_bp == 1690000
assert remaining_bp == 11050000

# third row
row = rows[2]
assert row["name"].startswith("NC_009486.1 ")
f_match = float(row["f_match"])
f_unique_to_query = round(float(row["f_unique_to_query"]), 5)
unique_intersect_bp = int(row["unique_intersect_bp"])
remaining_bp = int(row["remaining_bp"])
assert round(f_match, 5) == round(0.4842105, 5)
assert f_unique_to_query == round(0.0627557, 5)
assert unique_intersect_bp == 920000
assert remaining_bp == 10130000


def test_gather_metagenome(runtmp):
testdata_glob = utils.get_test_data("gather/GCF*.sig")
testdata_sigs = glob.glob(testdata_glob)
Expand Down

0 comments on commit 6e84ecf

Please sign in to comment.