commit 02e907a (parent 1c05661)
luizirber committed Oct 31, 2020

sourmash/sbt.py: 10 additions & 7 deletions
@@ -1364,7 +1364,8 @@ def scaffold(original_datasets, storage, factory=None):
             raise ValueError("unknown dataset type")
     del original_datasets
 
-    # TODO: we can build the heap in parallel.
+    # TODO: we can build the heap in parallel, if the data was
+    # pickle-able for multiprocessing...
     # on top of doing the count_common calculations in parallel,
     # we can also avoid building the heap (just build a list first)
     # and then call heapify on it after the list is ready
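
Note: the "build a list first, then heapify" idea in the TODO above is what
this commit switches to further down. A minimal, self-contained sketch of the
pattern, using synthetic data instead of the sourmash objects:

import heapq
import random

# synthetic stand-in for the (-common, i, j) tuples built below
items = [(-random.randint(0, 100), i, j)
         for i in range(100) for j in range(i)]

# before this commit: n heappush calls, O(n log n) total
heap_a = []
for item in items:
    heapq.heappush(heap_a, item)

# after this commit: append to a plain list, then one O(n) heapify
heap_b = list(items)
heapq.heapify(heap_b)

# both are valid min-heaps exposing the same smallest element
assert heap_a[0] == heap_b[0]

For the all-pairs loop below that is n*(n-1)/2 entries, so the single heapify
saves a log factor on a potentially large n.
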
@@ -1393,7 +1394,8 @@ def scaffold(original_datasets, storage, factory=None):
                 # heapq defaults to a min heap,
                 # invert "common" here to avoid having to use the
                 # internal methods for a max heap
-                heapq.heappush(heap, (-common, i, j))
+                heap.append((-common, i, j))
+    heapq.heapify(heap)
 
     if factory is None:
         n_unique_hashes = len(hll)
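
Note: the "invert common" comment above is the standard trick for getting
max-heap behavior out of heapq, which only implements a min-heap. A small
illustration with made-up counts:

import heapq

# heapq is a min-heap; negating the key makes the largest
# "common" value pop first, without touching heapq internals.
common_counts = [(42, 0, 1), (7, 0, 2), (99, 1, 2)]  # (common, i, j)
heap = [(-common, i, j) for (common, i, j) in common_counts]
heapq.heapify(heap)

neg_common, i, j = heapq.heappop(heap)
print(-neg_common, i, j)  # 99 1 2: the most similar pair comes out first
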
@@ -1402,6 +1404,8 @@
         #htable_size *= num_htables
         print(len(hll), num_htables, htable_size)
 
+        # TODO: turns out len(hll) is too big in most cases.
+        # need a better heuristic for optimal size...
         htable_size = 1e5
         num_htables = 1
 
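
Note: for the sizing TODO above, one possible direction is the classic
Bloom-filter sizing formula fed with the HLL cardinality estimate. This is a
sketch under that assumption, not what the commit does (the code above
hardcodes htable_size = 1e5 and num_htables = 1); optimal_table_params and
fp_rate are names made up for illustration:

import math

def optimal_table_params(n_unique_hashes, fp_rate=0.05):
    """Bloom-filter sizing: bits per table and table count for
    n expected elements at a target false-positive rate."""
    if n_unique_hashes == 0:
        return 1, 1
    # total bits: m = -n * ln(p) / (ln 2)^2
    m = -n_unique_hashes * math.log(fp_rate) / (math.log(2) ** 2)
    # hash functions / tables: k = (m / n) * ln 2
    k = max(1, round((m / n_unique_hashes) * math.log(2)))
    htable_size = math.ceil(m / k)  # bits per table
    return htable_size, k

# e.g. 1M unique hashes at 5% FP: roughly 6.2M bits over 4 tables
print(optimal_table_params(1_000_000))
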
@@ -1471,16 +1475,15 @@ def scaffold(original_datasets, storage, factory=None):
         next_round = []
         total_nodes = len(current_round)
 
-        # TODO: we can build the heap in parallel.
-        # on top of doing the intersection_count calculations in parallel,
-        # we can also avoid building the heap (just build a list first)
-        # and then call heapify on it after the list is ready
+        # TODO: we can build the heap in parallel, if the data was
+        # pickle-able for multiprocessing...
         heap = []
         for (i, d1) in enumerate(current_round):
             for (j, d2) in enumerate(current_round):
                 if i > j:
                     common = d1.element.data.intersection_count(d2.element.data)
-                    heapq.heappush(heap, (-common, i, j))
+                    heap.append((-common, i, j))
+        heapq.heapify(heap)
 
         processed = set()
         while heap:
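
Note: a sketch of what the parallel TODO above could look like, assuming
hypothetical picklable dataset objects exposing the same intersection_count()
method used in the loop; the real tree nodes here are not picklable, which is
exactly the blocker the TODO names. build_heap_parallel and _pair_count are
made-up names:

import heapq
import itertools
from multiprocessing import Pool

def _pair_count(args):
    # compute one pair's similarity in a worker process
    i, j, a, b = args
    return (-a.intersection_count(b), i, j)

def build_heap_parallel(datasets, processes=4):
    # all i < j pairs, mirroring the `if i > j` filter above
    pairs = [(i, j, datasets[i], datasets[j])
             for i, j in itertools.combinations(range(len(datasets)), 2)]
    with Pool(processes) as pool:
        heap = pool.map(_pair_count, pairs)
    heapq.heapify(heap)  # one O(n) heapify instead of n pushes
    return heap

Shipping both objects in every task duplicates data across workers; once
pickling works, a shared read-only list plus index pairs would be cheaper.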
