Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update basic_search to support words that must not appear #774

Merged
merged 1 commit into from
Oct 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 54 additions & 34 deletions hed/models/basic_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,18 @@ def find_matching(series, search_string, regex=False):
""" Finds lines in the series that match the search string and returns a mask.

Syntax Rules:
- '@': Prefixing a term in the search string means the object must appear anywhere within a line.
- '@': Prefixing a term in the search string means the term must appear anywhere within a line.
- '~': Prefixing a term in the search string means the term must NOT appear within a line.
- Parentheses: Elements within parentheses must appear in the line with the same level of nesting.
eg: Search string: "(A), (B)" will match "(A), (B, C)", but not "(A, B)", since they don't
start in the same group.
e.g.: Search string: "(A), (B)" will match "(A), (B, C)", but not "(A, B)", since they don't
start in the same group.
- "LongFormTag*": A * will match any remaining word(anything but a comma or parenthesis)
- An individual term can be arbitrary regex, but it is limited to single continuous words.

Notes:
- Specific words only care about their level relative to other specific words, not overall.
e.g. "(A, B)" will find: "A, B", "(A, B)", (A, (C), B)", or ((A, B))"
- If you have no grouping or anywhere words in the search, it assumes all terms are anywhere words.
- The format of the series should match the format of the search string, whether it's in short or long form.
- To enable support for matching parent tags, ensure that both the series and search string are in long form.

Expand All @@ -33,60 +37,83 @@ def find_matching(series, search_string, regex=False):
if not regex:
# Replace *'s with a reasonable value for people who don't know regex
search_string = re.sub(r'(?<!\.)\*', '.*?', search_string)
anywhere_words, specific_words = find_words(search_string)
anywhere_words, negative_words, specific_words = find_words(search_string)
# If we have no nesting or anywhere words, assume they don't care about level
if "(" not in search_string and "@" not in search_string:
anywhere_words += specific_words
specific_words = []
delimiter_map = construct_delimiter_map(search_string, specific_words)
source_words = anywhere_words + specific_words
candidate_indexes = _verify_basic_words(series, anywhere_words, negative_words)

# Create a set of series of masks to determine which rows contain each individual word
candidate_indexes = set(series.index)

# Loop through source_words to filter down candidate_indexes
for word in source_words:
# do a basic check for all specific words(this doesn't verify word delimiters)
for word in specific_words:
matches = series.str.contains(word, regex=True)
current_word_indexes = set(matches[matches].index.tolist())

# Update candidate_indexes by taking the intersection with current_word_indexes
candidate_indexes &= current_word_indexes

if not candidate_indexes:
break

candidate_indexes = sorted(candidate_indexes)

full_mask = pd.Series(False, index=series.index)

candidate_series = series[candidate_indexes]
full_mask = pd.Series(False, index=series.index, dtype=bool)

mask = candidate_series.apply(verify_search_delimiters, args=(anywhere_words, specific_words, delimiter_map))
full_mask.loc[candidate_indexes] = mask
if candidate_indexes:
if specific_words:
candidate_series = series[candidate_indexes]
mask = candidate_series.apply(verify_search_delimiters, args=(specific_words, delimiter_map))
full_mask.loc[candidate_indexes] = mask
else:
full_mask.loc[candidate_indexes] = True

return full_mask


def _get_word_indexes(series, word):
pattern = r'(?:[ ,()]|^)' + word + r'(?:[ ,()]|$)'
matches = series.str.contains(pattern, regex=True)
return set(matches[matches].index.tolist())


def _verify_basic_words(series, anywhere_words, negative_words):
candidate_indexes = set(series.index)
for word in anywhere_words:
current_word_indexes = _get_word_indexes(series, word)
candidate_indexes &= current_word_indexes

for word in negative_words:
current_word_indexes = _get_word_indexes(series, word)
candidate_indexes -= current_word_indexes
return candidate_indexes


def find_words(search_string):
""" Extract all words in the search string. Dividing them into words that must be relative to each other,
and words that can be anywhere.
"""
Extract words in the search string based on their prefixes.

Args:
search_string (str): The search query string to parse.
Words prefixed with '@' are 'anywhere' words.
Words can be prefixed with '@' or '~'.

Returns:
tuple: A tuple containing two lists:
- anywhere_words (list of str): Words that can appear anywhere in the text.
- specific_words (list of str): Words that must appear relative to other terms.
list: A list containing three lists:
- Words prefixed with '@'
- Words prefixed with '~'
- Words with no prefix
"""
# Match sequences of characters that are not commas, parentheses, or standalone spaces.
# Match sequences of characters that are not commas or parentheses.
pattern = r'[^,()]+'
words = re.findall(pattern, search_string)

# Remove any extraneous whitespace from each word
words = [word.strip() for word in words if word.strip()]

anywhere_words = [word[1:] for word in words if word.startswith("@")]
specific_words = [word for word in words if not word.startswith("@")]
at_words = [word[1:] for word in words if word.startswith("@")]
tilde_words = [word[1:] for word in words if word.startswith("~")]
no_prefix_words = [word for word in words if not word.startswith("~") and not word.startswith("@")]

return anywhere_words, specific_words
return [at_words, tilde_words, no_prefix_words]


def check_parentheses(text):
Expand Down Expand Up @@ -180,12 +207,11 @@ def construct_delimiter_map(text, words):
return delimiter_map


def verify_search_delimiters(text, anywhere_words, specific_words, delimiter_map):
def verify_search_delimiters(text, specific_words, delimiter_map):
""" Verifies if the text contains specific words with expected delimiters between them.

Args:
text (str): The text to search in.
anywhere_words (list of str): Words that can appear anywhere in the text.
specific_words (list of str): Words that must appear relative to other words in the text
delimiter_map (dict): A dictionary specifying expected delimiters between pairs of specific words.

Expand All @@ -194,12 +220,6 @@ def verify_search_delimiters(text, anywhere_words, specific_words, delimiter_map
"""
locations = defaultdict(list)

# Check for anywhere words
for word in anywhere_words:
pattern = r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)'
if not any(re.finditer(pattern, text)):
return False

# Find all locations for each word in the text
for word in specific_words:
for match in re.finditer(r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)', text):
Expand Down
Loading