Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter prefixes #293

Merged
merged 13 commits into from
Aug 4, 2022
59 changes: 54 additions & 5 deletions sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,28 @@ def clean_prefix_map(self) -> None:
self.df = filter_out_prefixes(self.df, missing_prefixes)
self.prefix_map = new_prefixes

def remove_mappings(self, msdf: "MappingSetDataFrame") -> None:
    """Remove from this mapping set every mapping that also occurs in ``msdf``.

    Two mappings are treated as the same mapping when they agree on all
    ``KEY_FEATURES`` columns. ``self.df`` is replaced by the remaining rows
    (re-indexed from 0) and the prefix map is cleaned afterwards.

    :param msdf: MappingSetDataframe object to be removed from primary msdf object.
    """
    # Only the key columns of the other frame decide which rows go away.
    # Merging just those (deduplicated) with a left join keeps the columns,
    # dtypes and row order of ``self.df`` intact: no suffixed duplicates are
    # created, and no column that exists only in ``msdf.df`` leaks in.
    keys_to_remove = msdf.df[KEY_FEATURES].drop_duplicates()
    merged = self.df.merge(
        keys_to_remove,
        on=KEY_FEATURES,
        how="left",
        indicator=True,
    )
    # "left_only" marks rows of ``self.df`` with no counterpart in ``msdf.df``.
    self.df = (
        merged.loc[merged["_merge"] == "left_only"]
        .drop(columns="_merge")
        .reset_index(drop=True)
    )
    self.clean_prefix_map()


@dataclass
class EntityPair:
Expand Down Expand Up @@ -990,27 +1012,54 @@ def get_prefixes_used_in_table(df: pd.DataFrame) -> List[str]:
return list(set(prefixes))


def filter_out_prefixes(
    df: pd.DataFrame, filter_prefixes: List[str], features: list = KEY_FEATURES
) -> pd.DataFrame:
    """Drop any row where a CURIE in one of the considered columns uses one of the given prefixes.

    :param df: Pandas DataFrame
    :param filter_prefixes: List of prefixes to filter out
    :param features: List of dataframe column names to consider
    :return: Pandas DataFrame with the rows in which none of the considered
        CURIEs uses a filtered prefix; an empty result keeps the columns of ``df``
    """
    filter_prefix_set = set(filter_prefixes)
    rows = []

    for _, row in df.iterrows():
        # Prefixes of the CURIEs found in the considered columns of this row.
        prefixes = {get_prefix_from_curie(curie) for curie in row[features]}
        # Keep the row only if none of its prefixes is in the filter set.
        if prefixes.isdisjoint(filter_prefix_set):
            rows.append(row)
    if rows:
        return pd.DataFrame(rows)

    # Nothing survived: return an empty frame that still carries the input's
    # columns rather than only the feature columns, so the schema is stable.
    empty = df.iloc[0:0].copy()
    for column in features:
        # Only reachable for an empty input that lacks a feature column.
        if column not in empty.columns:
            empty[column] = pd.Series(dtype=object)
    return empty


def filter_prefixes(
    df: pd.DataFrame, filter_prefixes: List[str], features: list = KEY_FEATURES
) -> pd.DataFrame:
    """Keep only rows where every CURIE in the considered columns uses one of the given prefixes.

    :param df: Pandas DataFrame
    :param filter_prefixes: List of prefixes to keep
    :param features: List of dataframe column names to consider
    :return: Pandas DataFrame with the rows in which all considered CURIEs use
        one of the given prefixes; an empty result keeps the columns of ``df``
    """
    filter_prefix_set = set(filter_prefixes)
    rows = []

    for _, row in df.iterrows():
        # Prefixes of the CURIEs found in the considered columns of this row.
        prefixes = {get_prefix_from_curie(curie) for curie in row[features]}
        # Keep the row only if all of its prefixes are in the filter set.
        if prefixes.issubset(filter_prefix_set):
            rows.append(row)
    if rows:
        return pd.DataFrame(rows)

    # Nothing matched: return an empty frame that still carries the input's
    # columns rather than only the feature columns, so the schema is stable.
    empty = df.iloc[0:0].copy()
    for column in features:
        # Only reachable for an empty input that lacks a feature column.
        if column not in empty.columns:
            empty[column] = pd.Series(dtype=object)
    return empty


# TODO this is not used anywhere
Expand Down
36 changes: 36 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
"""Test for merging MappingSetDataFrames."""
import unittest

from sssom.constants import OBJECT_ID, SUBJECT_ID
from sssom.io import extract_iri
from sssom.parsers import parse_sssom_table
from sssom.util import MappingSetDataFrame, filter_out_prefixes, filter_prefixes
from tests.constants import data_dir


class TestIO(unittest.TestCase):
"""A test case for merging msdfs."""

def setUp(self) -> None:
    """Load the basic mapping set and pick the columns the filters inspect."""
    self.features = [SUBJECT_ID, OBJECT_ID]
    self.msdf = parse_sssom_table(f"{data_dir}/basic.tsv")

def test_broken_predicate_list(self):
"""Test merging of multiple msdfs."""
pred_filter_list = ["skos:relatedMatch", f"{data_dir}/predicate_list3.txt"]
Expand All @@ -18,3 +26,31 @@ def test_broken_predicate_list(self):
if p_iri:
iri_list.extend(p_iri)
self.assertEqual(3, len(iri_list))

def test_filter_prefixes(self):
    """Check the number of rows kept when filtering MSDF.df to the given prefixes."""
    kept = filter_prefixes(self.msdf.df, ["x", "y"], self.features)
    row_count = len(kept)
    self.assertEqual(row_count, 40)

def test_filter_out_prefixes(self):
    """Check the number of rows left after filtering the given prefixes out of MSDF.df."""
    remaining = filter_out_prefixes(self.msdf.df, ["x", "y"], self.features)
    row_count = len(remaining)
    self.assertEqual(row_count, 5)

def test_remove_mappings(self):
    """Check that remove_mappings shrinks the msdf by the size of the removed set."""
    # Build the mapping set to remove: the rows that survive filtering out "x" and "y".
    to_remove_df = filter_out_prefixes(self.msdf.df, ["x", "y"], self.features)
    new_msdf = MappingSetDataFrame(
        df=to_remove_df,
        prefix_map=self.msdf.prefix_map,
        metadata=self.msdf.metadata,
    )
    original_length = len(self.msdf.df)

    self.msdf.remove_mappings(new_msdf)

    # len(self.msdf.df) = 141 and len(new_msdf.df) = 5
    remaining_length = len(self.msdf.df)
    self.assertEqual(remaining_length, original_length - len(new_msdf.df))