From 9944115f3e011be81a3bd271ee9990e442a8010f Mon Sep 17 00:00:00 2001 From: pranathivemuri Date: Mon, 3 May 2021 11:38:29 -0700 Subject: [PATCH 1/4] figure out if compare can have function inside a function --- src/sourmash/compare.py | 72 +++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 42 deletions(-) diff --git a/src/sourmash/compare.py b/src/sourmash/compare.py index 18d6001ffd..09ee6dee6f 100644 --- a/src/sourmash/compare.py +++ b/src/sourmash/compare.py @@ -92,38 +92,6 @@ def similarity_args_unpack(args, ignore_abundance, downsample): downsample=downsample) -def get_similarities_at_index(index, ignore_abundance, downsample, siglist): - """Returns similarities of all the combinations of signature at index in - the siglist with the rest of the indices starting at index + 1. Doesn't - redundantly calculate signatures with all the other indices prior to - index - 1 - - :param int index: generate masks from this image - :param boolean ignore_abundance - If the sketches are not abundance weighted, or ignore_abundance=True, - compute Jaccard similarity. - - If the sketches are abundance weighted, calculate the angular - similarity. - :param boolean downsample by scaled if True - :param siglist list of signatures - :return: list of similarities for the combinations of signature at index - with rest of the signatures from index+1 - """ - startt = time.time() - sig_iterator = itertools.product([siglist[index]], siglist[index + 1:]) - func = partial(similarity_args_unpack, - ignore_abundance=ignore_abundance, - downsample=downsample) - similarity_list = list(map(func, sig_iterator)) - notify( - "comparison for index {} done in {:.5f} seconds", - index, - time.time() - startt, - end='\r') - return similarity_list - - def compare_parallel(siglist, ignore_abundance, downsample, n_jobs): """Compare all combinations of signatures and return a matrix of similarities. Processes combinations parallely on number of processes @@ -159,15 +127,35 @@ def compare_parallel(siglist, ignore_abundance, downsample, n_jobs): memmap_similarities, filename = to_memmap(similarities) notify("Initialized memmapped similarities matrix") - # Initialize the function using func.partial with the common arguments like - # siglist, ignore_abundance, downsample, for computing all the signatures - # The only changing parameter that will be mapped from the pool is the index - func = partial( - get_similarities_at_index, - siglist=siglist, - ignore_abundance=ignore_abundance, - downsample=downsample) - notify("Created similarity func") + def get_similarities_at_index(index): + """Returns similarities of all the combinations of signature at index in + the siglist with the rest of the indices starting at index + 1. Doesn't + redundantly calculate signatures with all the other indices prior to + index - 1 + + :param int index: generate masks from this image + :param boolean ignore_abundance + If the sketches are not abundance weighted, or ignore_abundance=True, + compute Jaccard similarity. + + If the sketches are abundance weighted, calculate the angular + similarity. + :param boolean downsample by scaled if True + :return: list of similarities for the combinations of signature at index + with rest of the signatures from index+1 + """ + startt = time.time() + sig_iterator = itertools.product([siglist[index]], siglist[index + 1:]) + func = partial(similarity_args_unpack, + ignore_abundance=ignore_abundance, + downsample=downsample) + similarity_list = list(map(func, sig_iterator)) + notify( + "comparison for index {} done in {:.5f} seconds", + index, + time.time() - startt, + end='\r') + return similarity_list # Initialize multiprocess.pool pool = multiprocessing.Pool(processes=n_jobs) @@ -179,7 +167,7 @@ def compare_parallel(siglist, ignore_abundance, downsample, n_jobs): notify("Calculated chunk size for multiprocessing") # This will not generate the results yet, since pool.imap returns a generator - result = pool.imap(func, range(length_siglist), chunksize=chunksize) + result = pool.imap(get_similarities_at_index, range(length_siglist), chunksize=chunksize) notify("Initialized multiprocessing pool.imap") # Enumerate and calculate similarities at each of the indices From b7832d7593e90e2a5a71c576097b6d565c0ae1dd Mon Sep 17 00:00:00 2001 From: pranathivemuri Date: Mon, 3 May 2021 11:40:25 -0700 Subject: [PATCH 2/4] Added comments --- src/sourmash/compare.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sourmash/compare.py b/src/sourmash/compare.py index 09ee6dee6f..4ae49d1444 100644 --- a/src/sourmash/compare.py +++ b/src/sourmash/compare.py @@ -127,6 +127,8 @@ def compare_parallel(siglist, ignore_abundance, downsample, n_jobs): memmap_similarities, filename = to_memmap(similarities) notify("Initialized memmapped similarities matrix") + # To avoid sharing siglist via pickle declaring a function inside function + def get_similarities_at_index(index): """Returns similarities of all the combinations of signature at index in the siglist with the rest of the indices starting at index + 1. Doesn't From d89eab745a06f68df927e01f8d8134a7dbca6dc0 Mon Sep 17 00:00:00 2001 From: Pranathi Vemuri Date: Sun, 9 May 2021 14:02:15 -0700 Subject: [PATCH 3/4] Update src/sourmash/compare.py --- src/sourmash/compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/compare.py b/src/sourmash/compare.py index 4ae49d1444..c32f6a13be 100644 --- a/src/sourmash/compare.py +++ b/src/sourmash/compare.py @@ -128,7 +128,7 @@ def compare_parallel(siglist, ignore_abundance, downsample, n_jobs): notify("Initialized memmapped similarities matrix") # To avoid sharing siglist via pickle declaring a function inside function - +global get_similarities_at_index def get_similarities_at_index(index): """Returns similarities of all the combinations of signature at index in the siglist with the rest of the indices starting at index + 1. Doesn't From 613b03620e3a6dd36a8fbe0b2409409d1690dedf Mon Sep 17 00:00:00 2001 From: Pranathi Vemuri Date: Sun, 9 May 2021 14:02:44 -0700 Subject: [PATCH 4/4] Update src/sourmash/compare.py --- src/sourmash/compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/compare.py b/src/sourmash/compare.py index c32f6a13be..56e48c85f5 100644 --- a/src/sourmash/compare.py +++ b/src/sourmash/compare.py @@ -128,7 +128,7 @@ def compare_parallel(siglist, ignore_abundance, downsample, n_jobs): notify("Initialized memmapped similarities matrix") # To avoid sharing siglist via pickle declaring a function inside function -global get_similarities_at_index + global get_similarities_at_index def get_similarities_at_index(index): """Returns similarities of all the combinations of signature at index in the siglist with the rest of the indices starting at index + 1. Doesn't