commit 02e907a (parent 1c05661)
luizirber committed Oct 31, 2020

sourmash/sbt.py: 10 additions & 7 deletions
@@ -1364,7 +1364,8 @@ def scaffold(original_datasets, storage, factory=None):
             raise ValueError("unknown dataset type")
     del original_datasets
 
-    # TODO: we can build the heap in parallel.
+    # TODO: we can build the heap in parallel, if the data was
+    # pickle-able for multiprocessing...
     # on top of doing the count_common calculations in parallel,
     # we can also avoid building the heap (just build a list first)
     # and then call heapify on it after the list is ready
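
Note: the "build a list first, then heapify" idea in the TODO above is what
this commit switches to further down. A minimal, self-contained sketch of the
pattern, using synthetic data instead of the sourmash objects:

import heapq
import random

# synthetic stand-in for the (-common, i, j) tuples built below
items = [(-random.randint(0, 100), i, j)
         for i in range(100) for j in range(i)]

# before this commit: n heappush calls, O(n log n) total
heap_a = []
for item in items:
    heapq.heappush(heap_a, item)

# after this commit: append to a plain list, then one O(n) heapify
heap_b = list(items)
heapq.heapify(heap_b)

# both are valid min-heaps exposing the same smallest element
assert heap_a[0] == heap_b[0]

For the all-pairs loop below that is n*(n-1)/2 entries, so the single heapify
saves a log factor on a potentially large n.
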
@@ -1393,7 +1394,8 @@ def scaffold(original_datasets, storage, factory=None):
                 # heapq defaults to a min heap,
                 # invert "common" here to avoid having to use the
                 # internal methods for a max heap
-                heapq.heappush(heap, (-common, i, j))
+                heap.append((-common, i, j))
+    heapq.heapify(heap)
 
     if factory is None:
         n_unique_hashes = len(hll)
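
Note: the "invert common" comment above is the standard trick for getting
max-heap behavior out of heapq, which only implements a min-heap. A small
illustration with made-up counts:

import heapq

# heapq is a min-heap; negating the key makes the largest
# "common" value pop first, without touching heapq internals.
common_counts = [(42, 0, 1), (7, 0, 2), (99, 1, 2)]  # (common, i, j)
heap = [(-common, i, j) for (common, i, j) in common_counts]
heapq.heapify(heap)

neg_common, i, j = heapq.heappop(heap)
print(-neg_common, i, j)  # 99 1 2: the most similar pair comes out first
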
@@ -1402,6 +1404,8 @@
         #htable_size *= num_htables
         print(len(hll), num_htables, htable_size)
 
+        # TODO: turns out len(hll) is too big in most cases.
+        # need a better heuristic for optimal size...
         htable_size = 1e5
         num_htables = 1
 
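
Note: for the sizing TODO above, one possible direction is the classic
Bloom-filter sizing formula fed with the HLL cardinality estimate. This is a
sketch under that assumption, not what the commit does (the code above
hardcodes htable_size = 1e5 and num_htables = 1); optimal_table_params and
fp_rate are names made up for illustration:

import math

def optimal_table_params(n_unique_hashes, fp_rate=0.05):
    """Bloom-filter sizing: bits per table and table count for
    n expected elements at a target false-positive rate."""
    if n_unique_hashes == 0:
        return 1, 1
    # total bits: m = -n * ln(p) / (ln 2)^2
    m = -n_unique_hashes * math.log(fp_rate) / (math.log(2) ** 2)
    # hash functions / tables: k = (m / n) * ln 2
    k = max(1, round((m / n_unique_hashes) * math.log(2)))
    htable_size = math.ceil(m / k)  # bits per table
    return htable_size, k

# e.g. 1M unique hashes at 5% FP: roughly 6.2M bits over 4 tables
print(optimal_table_params(1_000_000))
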
@@ -1471,16 +1475,15 @@ def scaffold(original_datasets, storage, factory=None):
         next_round = []
         total_nodes = len(current_round)
 
-        # TODO: we can build the heap in parallel.
-        # on top of doing the intersection_count calculations in parallel,
-        # we can also avoid building the heap (just build a list first)
-        # and then call heapify on it after the list is ready
+        # TODO: we can build the heap in parallel, if the data was
+        # pickle-able for multiprocessing...
         heap = []
         for (i, d1) in enumerate(current_round):
             for (j, d2) in enumerate(current_round):
                 if i > j:
                     common = d1.element.data.intersection_count(d2.element.data)
-                    heapq.heappush(heap, (-common, i, j))
+                    heap.append((-common, i, j))
+        heapq.heapify(heap)
 
         processed = set()
         while heap:
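
Note: a sketch of what the parallel TODO above could look like, assuming
hypothetical picklable dataset objects exposing the same intersection_count()
method used in the loop; the real tree nodes here are not picklable, which is
exactly the blocker the TODO names. build_heap_parallel and _pair_count are
made-up names:

import heapq
import itertools
from multiprocessing import Pool

def _pair_count(args):
    # compute one pair's similarity in a worker process
    i, j, a, b = args
    return (-a.intersection_count(b), i, j)

def build_heap_parallel(datasets, processes=4):
    # all i < j pairs, mirroring the `if i > j` filter above
    pairs = [(i, j, datasets[i], datasets[j])
             for i, j in itertools.combinations(range(len(datasets)), 2)]
    with Pool(processes) as pool:
        heap = pool.map(_pair_count, pairs)
    heapq.heapify(heap)  # one O(n) heapify instead of n pushes
    return heap

Shipping both objects in every task duplicates data across workers; once
pickling works, a shared read-only list plus index pairs would be cheaper.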
