From 3e6e9c1a39353070964d9c3688d7ea8517fcd26d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 14 Jan 2024 16:02:06 -0800 Subject: [PATCH 1/5] add explanation & default value to lca classify -h --- src/sourmash/cli/lca/classify.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sourmash/cli/lca/classify.py b/src/sourmash/cli/lca/classify.py index 00a7589cc2..7efe112bd8 100644 --- a/src/sourmash/cli/lca/classify.py +++ b/src/sourmash/cli/lca/classify.py @@ -9,7 +9,8 @@ def subparser(subparsers): help='query signatures to classify') subparser.add_argument('--query-from-file', help='file containing list of signature files to query') - subparser.add_argument('--threshold', metavar='T', type=int, default=5) + subparser.add_argument('--threshold', metavar='T', type=int, default=5, + help="minimum number of hashes needed for a taxonomic classification (default: 5)") subparser.add_argument( '--majority', action='store_true', help='use majority vote classification instead of lca' From bc8924fa67f1b2254e103c1e1656f9017aee3fa2 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 14 Jan 2024 16:03:11 -0800 Subject: [PATCH 2/5] fix download link for delmont subsample in lca tutorial --- doc/tutorials-lca.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorials-lca.md b/doc/tutorials-lca.md index 6fe002c29f..73ea4e70fa 100644 --- a/doc/tutorials-lca.md +++ b/doc/tutorials-lca.md @@ -126,7 +126,7 @@ on the command line; separate them with `--db` or `--query`. Download some pre-calculated signatures: ``` -curl -L https://osf.io/bw8d7/download?version=1 -o delmont-subsample-sigs.tar.gz +curl -L https://osf.io/bw8d7/download -o delmont-subsample-sigs.tar.gz tar xzf delmont-subsample-sigs.tar.gz ``` From 0a6444a2d070f993aaa54ccc90ce907a233d0ded Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 14 Jan 2024 16:10:52 -0800 Subject: [PATCH 3/5] update magic numbers in benchmarks --- benchmarks/benchmarks.py | 81 +++++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 31 deletions(-) diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index 375981299e..d8602d6cf4 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -1,28 +1,46 @@ -import os import random -from pathlib import Path from tempfile import NamedTemporaryFile - from sourmash.sbt_storage import ZipStorage from sourmash.minhash import MinHash +RANDOM_SEQ_SIZE=3000 +RANDOM_SEQ_SAMPLE=300 + +MINHASH_NUM=500 +MINHASH_K=21 + +GET_MINS_RANGE=500 +ADD_HASH_RANGE=10_000 +ADD_MANY_RANGE=1000 +SIMILARITY_TIMES=500 +COUNT_COMMON_TIMES=500 +MERGE_TIMES=500 +COPY_TIMES=500 +CONCAT_TIMES=500 +SET_ABUNDANCES_RANGE=500 +ZIP_STORAGE_WRITE=100_000 +ZIP_STORAGE_LOAD=20 + def load_sequences(): sequences = [] for i in range(10): - random_seq = random.sample("A,C,G,T".split(",") * 3000, 300) + random_seq = random.sample("A,C,G,T".split(",") * RANDOM_SEQ_SIZE, + RANDOM_SEQ_NUM) sequences.append("".join(random_seq)) return sequences class TimeMinHashSuite: def setup(self): - self.mh = MinHash(500, 21, track_abundance=False) - self.protein_mh = MinHash(500, 21, is_protein=True, track_abundance=False) + self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False) + self.protein_mh = MinHash(MINHASH_NUM, MINHASH_K, is_protein=True, + track_abundance=False) self.sequences = load_sequences() - self.populated_mh = MinHash(500, 21, track_abundance=False) + self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K, + track_abundance=False) for seq in self.sequences: self.populated_mh.add_sequence(seq) @@ -40,52 +58,53 @@ def time_add_protein(self): def time_get_mins(self): mh = self.populated_mh - for i in range(500): + for i in range(GET_MINS_RANGE): mh.get_mins() def time_add_hash(self): mh = self.mh - for i in range(10000): + for i in range(ADD_HASH_RANGE): mh.add_hash(i) def time_add_many(self): mh = self.mh - mh.add_many(list(range(1000))) + mh.add_many(list(range(ADD_MANY_RANGE))) def time_similarity(self): mh = self.mh other_mh = self.populated_mh - for i in range(500): + for i in range(SIMILARITY_TIMES): mh.similarity(other_mh) def time_count_common(self): mh = self.mh other_mh = self.populated_mh - for i in range(500): + for i in range(COUNT_COMMON_TIMES): mh.count_common(other_mh) def time_merge(self): mh = self.mh other_mh = self.populated_mh - for i in range(500): + for i in range(MERGE_TIMES): mh.merge(other_mh) def time_copy(self): mh = self.populated_mh - for i in range(500): + for i in range(COPY_TIMES): mh.__copy__() def time_concat(self): mh = self.mh other_mh = self.populated_mh - for i in range(500): + for i in range(CONCAT_TIMES): mh += other_mh class PeakmemMinHashSuite: def setup(self): - self.mh = MinHash(500, 21, track_abundance=True) - self.protein_mh = MinHash(500, 21, is_protein=True, track_abundance=True) + self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True) + self.protein_mh = MinHash(MINHASH_NUM, MINHASH_K, + is_protein=True, track_abundance=True) self.sequences = load_sequences() def peakmem_add_sequence(self): @@ -102,12 +121,12 @@ def peakmem_add_protein(self): def peakmem_add_hash(self): mh = self.mh - for i in range(10000): + for i in range(ADD_HASH_RANGE): mh.add_hash(i) def peakmem_add_many(self): mh = self.mh - mh.add_many(list(range(1000))) + mh.add_many(list(range(ADD_MANY_RANGE))) #################### @@ -116,33 +135,33 @@ def peakmem_add_many(self): class TimeMinAbundanceSuite(TimeMinHashSuite): def setup(self): TimeMinHashSuite.setup(self) - self.mh = MinHash(500, 21, track_abundance=True) + self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True) - self.populated_mh = MinHash(500, 21, track_abundance=True) + self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True) for seq in self.sequences: self.populated_mh.add_sequence(seq) def time_get_mins_abundance(self): mh = self.populated_mh - for i in range(500): + for i in range(GET_MINS_RANGE): mh.get_mins(with_abundance=True) def time_set_abundances(self): mh = self.mh mins = self.populated_mh.get_mins(with_abundance=True) - for i in range(500): + for i in range(SET_ABUNDANCES_RANGE): mh.set_abundances(mins) def time_set_abundances_noclear(self): mh = self.mh mins = self.populated_mh.get_mins(with_abundance=True) - for i in range(500): + for i in range(SET_ABUNDANCES_RANGE): mh.set_abundances(mins, clear=False) class PeakmemMinAbundanceSuite(PeakmemMinHashSuite): def setup(self): PeakmemMinHashSuite.setup(self) - self.mh = MinHash(500, 21, track_abundance=True) + self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True) #################### @@ -154,7 +173,7 @@ def setup(self): with zipfile.ZipFile(self.zipfile, mode='w', compression=zipfile.ZIP_STORED) as storage: - for i in range(100_000): + for i in range(ZIP_STORAGE_WRITE): # just so we have lots of entries storage.writestr(str(i), b"0") # one big-ish entry @@ -162,12 +181,12 @@ def setup(self): def time_load_from_zipstorage(self): with ZipStorage(self.zipfile.name) as storage: - for i in range(20): + for i in range(ZIP_STORAGE_LOAD): storage.load("sig1") def time_load_small_from_zipstorage(self): with ZipStorage(self.zipfile.name) as storage: - for i in range(20): + for i in range(ZIP_STORAGE_LOAD): storage.load("99999") def teardown(self): @@ -181,7 +200,7 @@ def setup(self): with zipfile.ZipFile(self.zipfile, mode='w', compression=zipfile.ZIP_STORED) as storage: - for i in range(100_000): + for i in range(ZIP_STORAGE_WRITE): # just so we have lots of entries storage.writestr(str(i), b"0") # one big-ish entry @@ -190,12 +209,12 @@ def setup(self): def peakmem_load_from_zipstorage(self): with ZipStorage(self.zipfile.name) as storage: - for i in range(20): + for i in range(ZIP_STORAGE_LOAD): storage.load("sig1") def peakmem_load_small_from_zipstorage(self): with ZipStorage(self.zipfile.name) as storage: - for i in range(20): + for i in range(ZIP_STORAGE_LOAD): storage.load("99999") def teardown(self): From eeea8382772311a9f3b83f04ee8a1e5dfa95ef62 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 15 Jan 2024 06:44:28 -0800 Subject: [PATCH 4/5] fix benchmarks.py --- benchmarks/benchmarks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index d8602d6cf4..b2b3d7180b 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -5,7 +5,7 @@ from sourmash.minhash import MinHash RANDOM_SEQ_SIZE=3000 -RANDOM_SEQ_SAMPLE=300 +RANDOM_SEQ_NUMBER=300 MINHASH_NUM=500 MINHASH_K=21 @@ -27,7 +27,7 @@ def load_sequences(): sequences = [] for i in range(10): random_seq = random.sample("A,C,G,T".split(",") * RANDOM_SEQ_SIZE, - RANDOM_SEQ_NUM) + RANDOM_SEQ_NUMBER) sequences.append("".join(random_seq)) return sequences From 2765bc13dbe918a94cc4aec9905c668f72f00202 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 15 Jan 2024 06:52:32 -0800 Subject: [PATCH 5/5] add a README --- benchmarks/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 benchmarks/README.md diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000000..9cec154692 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,9 @@ +# benchmarks for asv ([airspeed velocity](https://asv.readthedocs.io/en/stable/index.html)) + +The code in here is run by GitHub Actions during continuous integration. + +To test quickly, run: + +``` +asv run --show-stderr --quick +```