MRG: fix benchmark code & a few other small issues from pyOpenSci review #2920

Merged · 5 commits · Jan 15, 2024
Changes from all commits
**benchmarks/README.md** (9 additions, 0 deletions)
````diff
@@ -0,0 +1,9 @@
+# benchmarks for asv ([airspeed velocity](https://asv.readthedocs.io/en/stable/index.html))
+
+The code in here is run by GitHub Actions during continuous integration.
+
+To test quickly, run:
+
+```
+asv run --show-stderr --quick
+```
````
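Beyond the quick single-pass run in the new README, asv can also compare two revisions head to head. A sketch, assuming a checkout with a `main` branch and the asv configuration in this directory:

```
asv continuous main HEAD
```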
81 changes: 50 additions & 31 deletions benchmarks/benchmarks.py
```diff
@@ -1,28 +1,46 @@
 import os
 import random
 from pathlib import Path
 from tempfile import NamedTemporaryFile


 from sourmash.sbt_storage import ZipStorage
 from sourmash.minhash import MinHash

+RANDOM_SEQ_SIZE=3000
+RANDOM_SEQ_NUMBER=300
+
+MINHASH_NUM=500
+MINHASH_K=21
+
+GET_MINS_RANGE=500
+ADD_HASH_RANGE=10_000
+ADD_MANY_RANGE=1000
```
On the underscore-separated constants, **Contributor** commented:

> Will the underscore throw an error in `range()`?

**Contributor Author** replied:

> as it turns out, that is a valid Python way to write 10e5!
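The reply is right that the underscores are legal: since Python 3.6 (PEP 515), underscores in numeric literals are ignored digit separators, so values like `10_000` and `100_000` work anywhere an `int` does, including `range()`. (Strictly speaking, `100_000` equals `10**5`; in Python, `10e5` is the float `1000000.0`.) A quick check:

```python
# PEP 515 (Python 3.6+): underscores in numeric literals are purely
# visual separators; the values are ordinary ints.
assert 10_000 == 10000
assert 100_000 == 10**5

# range() therefore accepts them like any other int:
assert len(range(10_000)) == 10000
```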

```diff
+SIMILARITY_TIMES=500
+COUNT_COMMON_TIMES=500
+MERGE_TIMES=500
+COPY_TIMES=500
+CONCAT_TIMES=500
+SET_ABUNDANCES_RANGE=500
+ZIP_STORAGE_WRITE=100_000
+ZIP_STORAGE_LOAD=20
```


```diff
 def load_sequences():
     sequences = []
     for i in range(10):
-        random_seq = random.sample("A,C,G,T".split(",") * 3000, 300)
+        random_seq = random.sample("A,C,G,T".split(",") * RANDOM_SEQ_SIZE,
+                                   RANDOM_SEQ_NUMBER)
         sequences.append("".join(random_seq))
     return sequences


 class TimeMinHashSuite:
     def setup(self):
-        self.mh = MinHash(500, 21, track_abundance=False)
-        self.protein_mh = MinHash(500, 21, is_protein=True, track_abundance=False)
+        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False)
+        self.protein_mh = MinHash(MINHASH_NUM, MINHASH_K, is_protein=True,
+                                  track_abundance=False)
         self.sequences = load_sequences()

-        self.populated_mh = MinHash(500, 21, track_abundance=False)
+        self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K,
+                                    track_abundance=False)
         for seq in self.sequences:
             self.populated_mh.add_sequence(seq)

@@ -40,52 +58,53 @@ def time_add_protein(self):

     def time_get_mins(self):
         mh = self.populated_mh
-        for i in range(500):
+        for i in range(GET_MINS_RANGE):
             mh.get_mins()

     def time_add_hash(self):
         mh = self.mh
-        for i in range(10000):
+        for i in range(ADD_HASH_RANGE):
             mh.add_hash(i)

     def time_add_many(self):
         mh = self.mh
-        mh.add_many(list(range(1000)))
+        mh.add_many(list(range(ADD_MANY_RANGE)))

     def time_similarity(self):
         mh = self.mh
         other_mh = self.populated_mh
-        for i in range(500):
+        for i in range(SIMILARITY_TIMES):
             mh.similarity(other_mh)

     def time_count_common(self):
         mh = self.mh
         other_mh = self.populated_mh
-        for i in range(500):
+        for i in range(COUNT_COMMON_TIMES):
             mh.count_common(other_mh)

     def time_merge(self):
         mh = self.mh
         other_mh = self.populated_mh
-        for i in range(500):
+        for i in range(MERGE_TIMES):
             mh.merge(other_mh)

     def time_copy(self):
         mh = self.populated_mh
-        for i in range(500):
+        for i in range(COPY_TIMES):
             mh.__copy__()

     def time_concat(self):
         mh = self.mh
         other_mh = self.populated_mh
-        for i in range(500):
+        for i in range(CONCAT_TIMES):
             mh += other_mh


 class PeakmemMinHashSuite:
     def setup(self):
-        self.mh = MinHash(500, 21, track_abundance=True)
-        self.protein_mh = MinHash(500, 21, is_protein=True, track_abundance=True)
+        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
+        self.protein_mh = MinHash(MINHASH_NUM, MINHASH_K,
+                                  is_protein=True, track_abundance=True)
         self.sequences = load_sequences()

     def peakmem_add_sequence(self):
@@ -102,12 +121,12 @@ def peakmem_add_protein(self):

     def peakmem_add_hash(self):
         mh = self.mh
-        for i in range(10000):
+        for i in range(ADD_HASH_RANGE):
             mh.add_hash(i)

     def peakmem_add_many(self):
         mh = self.mh
-        mh.add_many(list(range(1000)))
+        mh.add_many(list(range(ADD_MANY_RANGE)))


 ####################
```
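A note on the structure above: asv discovers benchmarks by naming convention. Methods prefixed `time_` are timed, methods prefixed `peakmem_` have their peak memory usage recorded, and `setup()` runs before each benchmark. A minimal sketch of those conventions, using a hypothetical suite rather than anything from this PR:

```python
class TimeExampleSuite:
    """asv times each time_* method; setup() runs before each one."""

    def setup(self):
        self.data = list(range(10_000))

    def time_sum(self):
        sum(self.data)


class TimeBiggerExampleSuite(TimeExampleSuite):
    """Subclassing re-runs the inherited benchmarks against a new
    setup(); the abundance suites below use the same trick."""

    def setup(self):
        self.data = list(range(100_000))
```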
```diff
@@ -116,33 +135,33 @@ def peakmem_add_many(self):
 class TimeMinAbundanceSuite(TimeMinHashSuite):
     def setup(self):
         TimeMinHashSuite.setup(self)
-        self.mh = MinHash(500, 21, track_abundance=True)
+        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)

-        self.populated_mh = MinHash(500, 21, track_abundance=True)
+        self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
         for seq in self.sequences:
             self.populated_mh.add_sequence(seq)

     def time_get_mins_abundance(self):
         mh = self.populated_mh
-        for i in range(500):
+        for i in range(GET_MINS_RANGE):
             mh.get_mins(with_abundance=True)

     def time_set_abundances(self):
         mh = self.mh
         mins = self.populated_mh.get_mins(with_abundance=True)
-        for i in range(500):
+        for i in range(SET_ABUNDANCES_RANGE):
             mh.set_abundances(mins)

     def time_set_abundances_noclear(self):
         mh = self.mh
         mins = self.populated_mh.get_mins(with_abundance=True)
-        for i in range(500):
+        for i in range(SET_ABUNDANCES_RANGE):
             mh.set_abundances(mins, clear=False)

 class PeakmemMinAbundanceSuite(PeakmemMinHashSuite):
     def setup(self):
         PeakmemMinHashSuite.setup(self)
-        self.mh = MinHash(500, 21, track_abundance=True)
+        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)

 ####################
```

```diff
@@ -154,20 +173,20 @@ def setup(self):

         with zipfile.ZipFile(self.zipfile, mode='w',
                              compression=zipfile.ZIP_STORED) as storage:
-            for i in range(100_000):
+            for i in range(ZIP_STORAGE_WRITE):
                 # just so we have lots of entries
                 storage.writestr(str(i), b"0")
             # one big-ish entry
             storage.writestr("sig1", b"9" * 1_000_000)

     def time_load_from_zipstorage(self):
         with ZipStorage(self.zipfile.name) as storage:
-            for i in range(20):
+            for i in range(ZIP_STORAGE_LOAD):
                 storage.load("sig1")

     def time_load_small_from_zipstorage(self):
         with ZipStorage(self.zipfile.name) as storage:
-            for i in range(20):
+            for i in range(ZIP_STORAGE_LOAD):
                 storage.load("99999")

     def teardown(self):
@@ -181,7 +200,7 @@ def setup(self):

         with zipfile.ZipFile(self.zipfile, mode='w',
                              compression=zipfile.ZIP_STORED) as storage:
-            for i in range(100_000):
+            for i in range(ZIP_STORAGE_WRITE):
                 # just so we have lots of entries
                 storage.writestr(str(i), b"0")
             # one big-ish entry
@@ -190,12 +209,12 @@ def setup(self):

     def peakmem_load_from_zipstorage(self):
         with ZipStorage(self.zipfile.name) as storage:
-            for i in range(20):
+            for i in range(ZIP_STORAGE_LOAD):
                 storage.load("sig1")

     def peakmem_load_small_from_zipstorage(self):
         with ZipStorage(self.zipfile.name) as storage:
-            for i in range(20):
+            for i in range(ZIP_STORAGE_LOAD):
                 storage.load("99999")

     def teardown(self):
```
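The zip-storage suites above follow a common shape: build a throwaway zip with many tiny members plus one large one, repeatedly load members, then clean up. A self-contained sketch of that shape using only the standard library, with `zipfile` standing in for sourmash's `ZipStorage` and a scaled-down entry count; the `teardown()` bodies are collapsed in the diff, so the final `os.unlink` is an assumption:

```python
import os
import zipfile
from tempfile import NamedTemporaryFile

# build a zip with many tiny entries plus one big-ish one
tmp = NamedTemporaryFile(suffix=".zip", delete=False)
with zipfile.ZipFile(tmp, mode="w", compression=zipfile.ZIP_STORED) as z:
    for i in range(1_000):  # scaled down from ZIP_STORAGE_WRITE=100_000
        z.writestr(str(i), b"0")
    z.writestr("sig1", b"9" * 1_000_000)
tmp.close()

# the benchmark's access pattern: repeated loads of the same member
with zipfile.ZipFile(tmp.name) as z:
    for _ in range(20):  # ZIP_STORAGE_LOAD
        data = z.read("sig1")
assert len(data) == 1_000_000

os.unlink(tmp.name)  # assumed cleanup, mirroring teardown()
```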
**doc/tutorials-lca.md** (1 addition, 1 deletion)
````diff
@@ -126,7 +126,7 @@ on the command line; separate them with `--db` or `--query`.
 Download some pre-calculated signatures:
 
 ```
-curl -L https://osf.io/bw8d7/download?version=1 -o delmont-subsample-sigs.tar.gz
+curl -L https://osf.io/bw8d7/download -o delmont-subsample-sigs.tar.gz
 tar xzf delmont-subsample-sigs.tar.gz
 ```
````

**src/sourmash/cli/lca/classify.py** (2 additions, 1 deletion)
```diff
@@ -9,7 +9,8 @@ def subparser(subparsers):
                            help='query signatures to classify')
     subparser.add_argument('--query-from-file',
                            help='file containing list of signature files to query')
-    subparser.add_argument('--threshold', metavar='T', type=int, default=5)
+    subparser.add_argument('--threshold', metavar='T', type=int, default=5,
+                           help="minimum number of hashes needed for a taxonomic classification (default: 5)")
     subparser.add_argument(
         '--majority', action='store_true',
         help='use majority vote classification instead of lca'
```
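The only functional change here is the new help string, which argparse renders in `--help` output while the default behavior stays the same. A minimal standalone sketch of the pattern, using a hypothetical parser rather than sourmash's actual CLI wiring:

```python
import argparse

# hypothetical stand-in for sourmash's lca classify subparser
parser = argparse.ArgumentParser(prog="sourmash lca classify")
parser.add_argument("--threshold", metavar="T", type=int, default=5,
                    help="minimum number of hashes needed for a "
                         "taxonomic classification (default: 5)")

args = parser.parse_args([])  # no flag: falls back to the default
assert args.threshold == 5
args = parser.parse_args(["--threshold", "10"])
assert args.threshold == 10
```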