Skip to content

Commit

Permalink
Add FilterRule.all() and FilterRule.none() rules + user readable disp…
Browse files Browse the repository at this point in the history
…lay string (epic-open-source#27)
  • Loading branch information
gbowlin authored Jun 14, 2024
1 parent b8bf964 commit 90aeb0d
Show file tree
Hide file tree
Showing 3 changed files with 338 additions and 75 deletions.
1 change: 1 addition & 0 deletions changelog/27.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* Added `FilterRule.all()` and `FilterRule.none()` class methods for matching all or no rows of a dataframe.
135 changes: 122 additions & 13 deletions src/seismometer/data/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,14 @@ class FilterRule(object):
"""

MIN_ROWS: Optional[int] = 10
left: Union["FilterRule", str]
left: Union["FilterRule", str, None]
relation: str
right: Any

method_router = {
# returns a matching dataframe data, left, right
"all": lambda x, y, z: pd.Series(True, index=x.index),
"none": lambda x, y, z: pd.Series(False, index=x.index),
"isna": lambda x, y, z: x[y].isna(),
"notna": lambda x, y, z: ~x[y].isna(),
"isin": lambda x, y, z: x[y].isin(z),
Expand All @@ -32,6 +35,8 @@ class FilterRule(object):
}

inversion = {
"all": "none",
"none": "all",
"isna": "notna",
"notna": "isna",
"isin": "notin",
Expand All @@ -44,7 +49,7 @@ class FilterRule(object):
">": "<=",
}

def __init__(self, left: Union["FilterRule", str], relation: str, right: Any = None):
def __init__(self, left: Union["FilterRule", str, None], relation: str, right: Any = None):
"""
A FilterRule is a relationship that can be reused for filtering data frames.
Expand All @@ -70,6 +75,9 @@ def __init__(self, left: Union["FilterRule", str], relation: str, right: Any = N
f"NaN checking relation '{relation}' does not accept right item. Right item is of type {type(right)}"
)

if relation in ["all", "none"] and (right is not None or left is not None):
raise TypeError(f"Universal relation '{relation}' does not accept left/right items")

if relation in ["and", "or"]:
if not isinstance(left, FilterRule):
raise TypeError(
Expand All @@ -85,6 +93,9 @@ def __repr__(self) -> str:
"""
String that represents a FilterRule.
"""
if self.relation in ["all", "none"]:
return f"FilterRule.{self.relation}()"

if self.relation in ["and", "or"]:
assert isinstance(self.left, FilterRule)
assert isinstance(self.right, FilterRule)
Expand All @@ -100,11 +111,56 @@ def __repr__(self) -> str:
return f"{left} | {right}"

if self.relation in ["isna", "notna"]:
return f"FilterRule('{self.left}', '{self.relation}')"
return f"FilterRule.{self.relation}('{self.left}')"
if not isinstance(self.right, str):
return f"FilterRule('{self.left}', '{self.relation}', {self.right})"
return f"FilterRule('{self.left}', '{self.relation}', '{self.right}')"

def __str__(self) -> str:
"""
User readable string that represents a FilterRule.
>>> rule1 = FilterRule("Val", ">=", 20)
>>> rule2 = FilterRule("T/F", "==", 0)
>>> rule3 = FilterRule("Other", "<", 5)
>>> str(rule1 | (rule2 & rule3))
'Val >= 20 or (T/F is 0 and Other < 5)'
"""
match self.relation:
case "all":
return "Include all"
case "none":
return "Exclude all"
case "isna":
return f"{self.left} is missing"
case "notna":
return f"{self.left} has a value"
case "isin":
return f"{self.left} is in: {', '.join(self.right)}"
case "notin":
return f"{self.left} not in: {', '.join(self.right)}"
case "==":
return f"{self.left} is {self.right}"
case "!=":
return f"{self.left} is not {self.right}"
case rel if rel in ["<=", "<", ">=", ">"]:
return f"{self.left} {rel} {self.right}"
case "and" | "or":
assert isinstance(self.left, FilterRule)
assert isinstance(self.right, FilterRule)
left = str(self.left)
right = str(self.right)
if self.left.relation in ["and", "or"]:
left = f"({left})"
if self.right.relation in ["and", "or"]:
right = f"({right})"
if self.relation == "and":
return f"{left} and {right}"
else: # The "or" case
return f"{left} or {right}"
case _: # relation is checked in __init__, this should never be reached
raise ValueError(f"Unknown relation {self.relation}")

def filter(self, data: pd.DataFrame) -> pd.DataFrame:
"""
Filters a dataframe to only the rows matching the FilterRule.
Expand All @@ -119,11 +175,11 @@ def filter(self, data: pd.DataFrame) -> pd.DataFrame:
pd.DataFrame
Filtered DataFrame.
"""
df = data[self.mask(data)]
if not self.MIN_ROWS or len(df) > self.MIN_ROWS:
df = data.loc[self.mask(data)]
if (not self.MIN_ROWS) or (len(df) > self.MIN_ROWS):
return df
else:
return df.iloc[0:0]
return df[pd.Series(False, index=df.index)]

def mask(self, data: pd.DataFrame) -> pd.Index:
"""
Expand All @@ -143,9 +199,33 @@ def mask(self, data: pd.DataFrame) -> pd.Index:
return relation(data, self.left, self.right)

def __or__(left, right) -> "FilterRule":
if left == right:
return left
if ~left == right:
return FilterRule.all()

if left.relation == "all" or right.relation == "all":
return FilterRule.all()
if left.relation == "none":
return right
if right.relation == "none":
return left

return FilterRule(left, "or", right)

def __and__(left, right) -> "FilterRule":
if left == right:
return left
if ~left == right:
return FilterRule.none()

if left.relation == "none" or right.relation == "none":
return FilterRule.none()
if left.relation == "all":
return right
if right.relation == "all":
return left

return FilterRule(left, "and", right)

def __invert__(self) -> "FilterRule":
Expand All @@ -158,8 +238,7 @@ def __invert__(self) -> "FilterRule":
assert isinstance(self.right, FilterRule)
if self.relation == "or":
return FilterRule(~self.left, "and", ~self.right)
elif self.relation == "and":
return FilterRule(~self.left, "or", ~self.right)
return FilterRule(~self.left, "or", ~self.right)
return FilterRule(self.left, FilterRule.inversion[self.relation], self.right)

def __eq__(self, other: object) -> bool:
Expand Down Expand Up @@ -268,14 +347,44 @@ def neq(cls, column, value) -> "FilterRule":
"""
return cls(column, "!=", value)

@classmethod
def all(cls):
"""
FilterRule that selects all rows.
"""
return cls(None, "all")

@classmethod
def none(cls):
"""
FilterRule that selects no rows.
"""
return cls(None, "none")


def filter_rule_from_cohort_dictionary(cohort: dict[str, tuple[any]] | None = None) -> FilterRule:
"""
For a given dictionary, generate a matching FilterRule
Parameters
----------
cohort : dict[str, tuple[any]], optional
A dictionary of column names and cohort category labels,
by default None, in which case FilterRule.all() is returned.
Returns
-------
FilterRule
A filter rule that verifies that each column in the keys has a value in the set of selected categories.
"""

rule = FilterRule.all()
if not cohort:
return rule

def filter_rule_from_cohort_dictionary(cohort=dict[str, tuple[any]]):
rule = None
for key in cohort:
if not cohort[key]:
continue
if rule is None:
rule = FilterRule(key, "isin", cohort[key])
else:
rule = rule & FilterRule(key, "isin", cohort[key])
rule = rule & FilterRule.isin(key, cohort[key])
return rule
Loading

0 comments on commit 90aeb0d

Please sign in to comment.