Skip to content

Commit

Permalink
Migrated bin_ids_forming_sequence from sequences.seqops to new sequen…
Browse files Browse the repository at this point in the history
…ces.choices, updating dependencies
  • Loading branch information
timbernat committed Jun 18, 2024
1 parent 541b944 commit 0f7bb23
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 43 deletions.
45 changes: 45 additions & 0 deletions polymerist/genutils/sequences/choices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
'''Enumeration algorithms for choosing a sequence of symbols out of a sequence of sets of those symbols'''

from typing import Generator, Iterable, Sequence, TypeVar
T = TypeVar('T') # generic type for sequence element

from copy import deepcopy
from collections import defaultdict, Counter # NOTE : these live in the stdlib "collections" module (there is no "containers" module)
from itertools import product as cartesian_product

from .seqops import is_unique


def bin_ids_forming_sequence(sequence : Sequence[T], choice_bins : Sequence[Iterable[T]], draw_without_repeats : bool=True, unique_bins : bool=False) -> Generator[tuple[int, ...], None, None]:
'''
Takes an ordered sequence of N objects of a given type and an ordered of any number of bins, each containing an arbitary amount of unordered objects of the same type
Generates all possible N-tuples of bin indices which could produce the target sequence when drawing from those bins in the
if draw_without_repeats=True, will respect the multiplicity of elements in each bin when drawing
(i.e. will never have a bin position appear for a given object more times that that object appears in the corresponding bin)
if unique_bins=True, will only allow each bin to be sampled from once, EVEN if that bin contains elements which may occur later in the sequence
'''
symbol_inventory = defaultdict(Counter) # keys are objects of type T ("symbols"), values give multiplicities of symbols keyed by bin position
for i, choice_bin in enumerate(choice_bins):
for sym in choice_bin:
symbol_inventory[sym][i] += 1 # NOTE : implementation here requires that T be a hashable type

for idxs in cartesian_product(*(symbol_inventory[item].keys() for item in sequence)): # generate every valid sequence of bin positions WITHOUT regard to repetition or uniqueness
if unique_bins and not is_unique(idxs):
continue # skip non-unique bin choices if the option is set

if draw_without_repeats:
choice_inventory = deepcopy(symbol_inventory) # make a new copy for each path check
for i, sym in zip(idxs, sequence, strict=True):
if choice_inventory[sym][i] == 0:
overdrawn = True
break
choice_inventory[sym][i] -= 1
else:
overdrawn = False

if overdrawn:
continue

yield idxs # only yield if all specified conditions are met
45 changes: 4 additions & 41 deletions polymerist/genutils/sequences/seqops.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
'''Generic operations for indexing, generating, and iterating over sequences'''

from typing import Generator, Iterable, Sequence, TypeVar, Union
from typing import Generator, Sequence, TypeVar, Union
T = TypeVar('T') # generic type for sequence element
S = TypeVar('S') # generic type for a distinct sequence element

from copy import deepcopy
from collections import defaultdict, Counter
from itertools import count, product as cartesian_product
from itertools import count


def is_unique(seq : Sequence) -> bool:
Expand Down Expand Up @@ -39,42 +36,8 @@ def cycle_items(seq : Sequence[T], places : int=1) -> list[T]:
cycle_items([1,2,3,4], 2) -> [3,4,1,2]
cycle_items([1,2,3,4], -1) -> [4,1,2,3]
'''
n_items = len(seq) # this length call is what requires the input to be a Seuquence and not just an Iterable
n_items = len(seq) # this length call is what requires the input to be a Sequence and not just an Iterable
return [
seq[i % n_items]
for i in range(places, places + n_items)
]

def bin_ids_forming_sequence(sequence : Sequence[T], choice_bins : Sequence[Iterable[T]], draw_without_repeats : bool=True, unique_bins : bool=False) -> Generator[tuple[int, ...], None, None]:
'''
Takes an ordered sequence of N objects of a given type and an ordered of any number of bins, each containing an arbitary amount of unordered objects of the same type
Generates all possible N-tuples of bin indices which could produce the target sequence when drawing from those bins in the
if draw_without_repeats=True, will respect the multiplicity of elements in each bin when drawing
(i.e. will never have a bin position appear for a given object more times that that object appears in the corresponding bin)
if unique_bins=True, will only allow each bin to be sampled from once, EVEN if that bin contains elements which may occur later in the sequence
'''
symbol_inventory = defaultdict(Counter) # keys are objects of type T ("symbols"), values give multiplicities of symbols keyed by bin position
for i, choice_bin in enumerate(choice_bins):
for sym in choice_bin:
symbol_inventory[sym][i] += 1 # NOTE : implementation here requires that T be a hashable type

for idxs in cartesian_product(*(symbol_inventory[item].keys() for item in sequence)): # generate every valid sequence of bin positions WITHOUT regard to repetition or uniqueness
if unique_bins and not is_unique(idxs):
continue # skip non-unique bin choices if the option is set

if draw_without_repeats:
choice_inventory = deepcopy(symbol_inventory) # make a new copy for each path check
for i, sym in zip(idxs, sequence, strict=True):
if choice_inventory[sym][i] == 0:
overdrawn = True
break
choice_inventory[sym][i] -= 1
else:
overdrawn = False

if overdrawn:
continue

yield idxs # only yield if all specified conditions are met
]
2 changes: 1 addition & 1 deletion polymerist/polymers/monographs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from itertools import product as cartesian_product

from ..genutils.iteration import asiterable
from ..genutils.sequences.seqops import bin_ids_forming_sequence
from ..genutils.sequences.choices import bin_ids_forming_sequence
from ..genutils.textual.delimiters import validate_braces
from ..genutils.fileutils.jsonio.serialize import TypeSerializer

Expand Down
2 changes: 1 addition & 1 deletion polymerist/rdutils/reactions/reactions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from ..labeling.bondwise import get_bonded_pairs_by_map_nums

from ...genutils.decorators.functional import allow_string_paths, allow_pathlib_paths
from ...genutils.sequences.seqops import bin_ids_forming_sequence
from ...genutils.sequences.choices import bin_ids_forming_sequence
from ...smileslib.substructures import matching_labels_from_substruct_dict


Expand Down

0 comments on commit 0f7bb23

Please sign in to comment.