From 3e6e9c1a39353070964d9c3688d7ea8517fcd26d Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 14 Jan 2024 16:02:06 -0800
Subject: [PATCH 1/5] add explanation & default value to lca classify -h

---
 src/sourmash/cli/lca/classify.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/sourmash/cli/lca/classify.py b/src/sourmash/cli/lca/classify.py
index 00a7589cc2..7efe112bd8 100644
--- a/src/sourmash/cli/lca/classify.py
+++ b/src/sourmash/cli/lca/classify.py
@@ -9,7 +9,8 @@ def subparser(subparsers):
                            help='query signatures to classify')
     subparser.add_argument('--query-from-file',
                            help='file containing list of signature files to query')
-    subparser.add_argument('--threshold', metavar='T', type=int, default=5)
+    subparser.add_argument('--threshold', metavar='T', type=int, default=5,
+                           help="minimum number of hashes needed for a taxonomic classification (default: 5)")
     subparser.add_argument(
         '--majority', action='store_true',
         help='use majority vote classification instead of lca'

From bc8924fa67f1b2254e103c1e1656f9017aee3fa2 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 14 Jan 2024 16:03:11 -0800
Subject: [PATCH 2/5] fix download link for delmont subsample in lca tutorial

---
 doc/tutorials-lca.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/tutorials-lca.md b/doc/tutorials-lca.md
index 6fe002c29f..73ea4e70fa 100644
--- a/doc/tutorials-lca.md
+++ b/doc/tutorials-lca.md
@@ -126,7 +126,7 @@ on the command line; separate them with `--db` or `--query`.
 Download some pre-calculated signatures:
 
 ```
-curl -L https://osf.io/bw8d7/download?version=1 -o delmont-subsample-sigs.tar.gz
+curl -L https://osf.io/bw8d7/download -o delmont-subsample-sigs.tar.gz
 tar xzf delmont-subsample-sigs.tar.gz
 ```
 

From 0a6444a2d070f993aaa54ccc90ce907a233d0ded Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 14 Jan 2024 16:10:52 -0800
Subject: [PATCH 3/5] replace magic numbers in benchmarks with named constants

---
 benchmarks/benchmarks.py | 81 +++++++++++++++++++++++++---------------
 1 file changed, 50 insertions(+), 31 deletions(-)

diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
index 375981299e..d8602d6cf4 100644
--- a/benchmarks/benchmarks.py
+++ b/benchmarks/benchmarks.py
@@ -1,28 +1,46 @@
-import os
 import random
-from pathlib import Path
 from tempfile import NamedTemporaryFile
 
-
 from sourmash.sbt_storage import ZipStorage
 from sourmash.minhash import MinHash
 
+RANDOM_SEQ_SIZE=3000
+RANDOM_SEQ_SAMPLE=300
+
+MINHASH_NUM=500
+MINHASH_K=21
+
+GET_MINS_RANGE=500
+ADD_HASH_RANGE=10_000
+ADD_MANY_RANGE=1000
+SIMILARITY_TIMES=500
+COUNT_COMMON_TIMES=500
+MERGE_TIMES=500
+COPY_TIMES=500
+CONCAT_TIMES=500
+SET_ABUNDANCES_RANGE=500
+ZIP_STORAGE_WRITE=100_000
+ZIP_STORAGE_LOAD=20
+
 
 def load_sequences():
     sequences = []
     for i in range(10):
-        random_seq = random.sample("A,C,G,T".split(",") * 3000, 300)
+        random_seq = random.sample("A,C,G,T".split(",") * RANDOM_SEQ_SIZE,
+                                   RANDOM_SEQ_NUM)
         sequences.append("".join(random_seq))
     return sequences
 
 
 class TimeMinHashSuite:
     def setup(self):
-        self.mh = MinHash(500, 21, track_abundance=False)
-        self.protein_mh = MinHash(500, 21, is_protein=True, track_abundance=False)
+        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False)
+        self.protein_mh = MinHash(MINHASH_NUM, MINHASH_K, is_protein=True,
+                                  track_abundance=False)
         self.sequences = load_sequences()
 
-        self.populated_mh = MinHash(500, 21, track_abundance=False)
+        self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K,
+                                    track_abundance=False)
         for seq in self.sequences:
             self.populated_mh.add_sequence(seq)
 
@@ -40,52 +58,53 @@ def time_add_protein(self):
 
     def time_get_mins(self):
         mh = self.populated_mh
-        for i in range(500):
+        for i in range(GET_MINS_RANGE):
             mh.get_mins()
 
     def time_add_hash(self):
         mh = self.mh
-        for i in range(10000):
+        for i in range(ADD_HASH_RANGE):
             mh.add_hash(i)
 
     def time_add_many(self):
         mh = self.mh
-        mh.add_many(list(range(1000)))
+        mh.add_many(list(range(ADD_MANY_RANGE)))
 
     def time_similarity(self):
         mh = self.mh
         other_mh = self.populated_mh
-        for i in range(500):
+        for i in range(SIMILARITY_TIMES):
             mh.similarity(other_mh)
 
     def time_count_common(self):
         mh = self.mh
         other_mh = self.populated_mh
-        for i in range(500):
+        for i in range(COUNT_COMMON_TIMES):
             mh.count_common(other_mh)
 
     def time_merge(self):
         mh = self.mh
         other_mh = self.populated_mh
-        for i in range(500):
+        for i in range(MERGE_TIMES):
             mh.merge(other_mh)
 
     def time_copy(self):
         mh = self.populated_mh
-        for i in range(500):
+        for i in range(COPY_TIMES):
             mh.__copy__()
 
     def time_concat(self):
         mh = self.mh
         other_mh = self.populated_mh
-        for i in range(500):
+        for i in range(CONCAT_TIMES):
             mh += other_mh
 
 
 class PeakmemMinHashSuite:
     def setup(self):
-        self.mh = MinHash(500, 21, track_abundance=True)
-        self.protein_mh = MinHash(500, 21, is_protein=True, track_abundance=True)
+        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
+        self.protein_mh = MinHash(MINHASH_NUM, MINHASH_K,
+                                  is_protein=True, track_abundance=True)
         self.sequences = load_sequences()
 
     def peakmem_add_sequence(self):
@@ -102,12 +121,12 @@ def peakmem_add_protein(self):
 
     def peakmem_add_hash(self):
         mh = self.mh
-        for i in range(10000):
+        for i in range(ADD_HASH_RANGE):
             mh.add_hash(i)
 
     def peakmem_add_many(self):
         mh = self.mh
-        mh.add_many(list(range(1000)))
+        mh.add_many(list(range(ADD_MANY_RANGE)))
 
 
 ####################
@@ -116,33 +135,33 @@ def peakmem_add_many(self):
 class TimeMinAbundanceSuite(TimeMinHashSuite):
     def setup(self):
         TimeMinHashSuite.setup(self)
-        self.mh = MinHash(500, 21, track_abundance=True)
+        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
 
-        self.populated_mh = MinHash(500, 21, track_abundance=True)
+        self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
         for seq in self.sequences:
             self.populated_mh.add_sequence(seq)
 
     def time_get_mins_abundance(self):
         mh = self.populated_mh
-        for i in range(500):
+        for i in range(GET_MINS_RANGE):
             mh.get_mins(with_abundance=True)
 
     def time_set_abundances(self):
         mh = self.mh
         mins = self.populated_mh.get_mins(with_abundance=True)
-        for i in range(500):
+        for i in range(SET_ABUNDANCES_RANGE):
             mh.set_abundances(mins)
 
     def time_set_abundances_noclear(self):
         mh = self.mh
         mins = self.populated_mh.get_mins(with_abundance=True)
-        for i in range(500):
+        for i in range(SET_ABUNDANCES_RANGE):
             mh.set_abundances(mins, clear=False)
 
 class PeakmemMinAbundanceSuite(PeakmemMinHashSuite):
     def setup(self):
         PeakmemMinHashSuite.setup(self)
-        self.mh = MinHash(500, 21, track_abundance=True)
+        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
 
 ####################
 
@@ -154,7 +173,7 @@ def setup(self):
 
         with zipfile.ZipFile(self.zipfile, mode='w',
                           compression=zipfile.ZIP_STORED) as storage:
-            for i in range(100_000):
+            for i in range(ZIP_STORAGE_WRITE):
                 # just so we have lots of entries
                 storage.writestr(str(i), b"0")
             # one big-ish entry
@@ -162,12 +181,12 @@ def setup(self):
 
     def time_load_from_zipstorage(self):
         with ZipStorage(self.zipfile.name) as storage:
-            for i in range(20):
+            for i in range(ZIP_STORAGE_LOAD):
                 storage.load("sig1")
 
     def time_load_small_from_zipstorage(self):
         with ZipStorage(self.zipfile.name) as storage:
-            for i in range(20):
+            for i in range(ZIP_STORAGE_LOAD):
                 storage.load("99999")
 
     def teardown(self):
@@ -181,7 +200,7 @@ def setup(self):
 
         with zipfile.ZipFile(self.zipfile, mode='w',
                           compression=zipfile.ZIP_STORED) as storage:
-            for i in range(100_000):
+            for i in range(ZIP_STORAGE_WRITE):
                 # just so we have lots of entries
                 storage.writestr(str(i), b"0")
             # one big-ish entry
@@ -190,12 +209,12 @@ def setup(self):
 
     def peakmem_load_from_zipstorage(self):
         with ZipStorage(self.zipfile.name) as storage:
-            for i in range(20):
+            for i in range(ZIP_STORAGE_LOAD):
                 storage.load("sig1")
 
     def peakmem_load_small_from_zipstorage(self):
         with ZipStorage(self.zipfile.name) as storage:
-            for i in range(20):
+            for i in range(ZIP_STORAGE_LOAD):
                 storage.load("99999")
 
     def teardown(self):

From eeea8382772311a9f3b83f04ee8a1e5dfa95ef62 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Mon, 15 Jan 2024 06:44:28 -0800
Subject: [PATCH 4/5] fix mismatched RANDOM_SEQ_* constant name in benchmarks.py

---
 benchmarks/benchmarks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
index d8602d6cf4..b2b3d7180b 100644
--- a/benchmarks/benchmarks.py
+++ b/benchmarks/benchmarks.py
@@ -5,7 +5,7 @@
 from sourmash.minhash import MinHash
 
 RANDOM_SEQ_SIZE=3000
-RANDOM_SEQ_SAMPLE=300
+RANDOM_SEQ_NUMBER=300
 
 MINHASH_NUM=500
 MINHASH_K=21
@@ -27,7 +27,7 @@ def load_sequences():
     sequences = []
     for i in range(10):
         random_seq = random.sample("A,C,G,T".split(",") * RANDOM_SEQ_SIZE,
-                                   RANDOM_SEQ_NUM)
+                                   RANDOM_SEQ_NUMBER)
         sequences.append("".join(random_seq))
     return sequences
 

From 2765bc13dbe918a94cc4aec9905c668f72f00202 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Mon, 15 Jan 2024 06:52:32 -0800
Subject: [PATCH 5/5] add a README

---
 benchmarks/README.md | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 benchmarks/README.md

diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000000..9cec154692
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,9 @@
+# benchmarks for asv ([airspeed velocity](https://asv.readthedocs.io/en/stable/index.html))
+
+The benchmarks in this directory are run by GitHub Actions during continuous integration.
+
+To test quickly, run:
+
+```
+asv run --show-stderr --quick
+```