Skip to content

Commit

Permalink
Merge pull request #98 from mapping-commons/issue_97
Browse files Browse the repository at this point in the history
accommodating for blank confidence #97
  • Loading branch information
matentzn authored Jul 23, 2021
2 parents 6c71967 + e0902cc commit 72e3f93
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 11 deletions.
5 changes: 5 additions & 0 deletions sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
is_curie,
)
from .util import RDF_FORMATS
import numpy as np

cwd = os.path.abspath(os.path.dirname(__file__))

Expand All @@ -37,6 +38,10 @@ def from_tsv(
"""
if validators.url(file_path) or os.path.exists(file_path):
df = read_pandas(file_path)

if 'confidence' in df.columns:
df['confidence'].replace(r'^\s*$', np.NaN, regex=True, inplace=True)

if meta is None:
meta = _read_metadata_from_table(file_path)
if "curie_map" in meta:
Expand Down
34 changes: 27 additions & 7 deletions sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import pandas as pd
import yaml
import numpy as np

from sssom.sssom_datamodel import Entity, slots
from .sssom_document import MappingSetDocument
Expand Down Expand Up @@ -287,6 +288,7 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate=False) -> pd.DataFr
# create a 'sort' method and then replace the following line by sort()
df = sort_sssom(df)
# df[CONFIDENCE] = df[CONFIDENCE].apply(lambda x: x + random.random() / 10000)
df, nan_df = assign_default_confidence(df)
if ignore_predicate:
key = [SUBJECT_ID, OBJECT_ID]
else:
Expand All @@ -302,21 +304,35 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate=False) -> pd.DataFr
CONFIDENCE
]
if ignore_predicate:
return df[
df[
df.apply(
lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])],
axis=1,
)
]
else:
return df[
df[
df.apply(
lambda x: x[CONFIDENCE]
>= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])],
axis=1,
)
]

# We are preserving confidence = NaN rows without making assumptions.
# This means that there are potential duplicate mappings
return_df = df.append(nan_df).drop_duplicates()
return return_df

def assign_default_confidence(df: pd.DataFrame):
    """Ensure ``df`` has a ``confidence`` column and split out the NaN rows.

    If the column is missing, it is added **in place** (the caller's frame is
    mutated) and filled with ``numpy.NaN`` for every row.

    :param df: mappings DataFrame; mutated in place when the ``confidence``
        column is absent. Assumed non-None (a ``None`` input would fail on
        the column access below, as in the original).
    :return: tuple ``(df, nan_df)`` where ``nan_df`` contains exactly the
        rows whose confidence is NaN (an empty DataFrame with the same
        columns when every row has a value).
    """
    if df is not None and 'confidence' not in df.columns:
        df['confidence'] = np.NaN

    # Boolean-mask selection always yields a DataFrame (possibly empty),
    # never None — the original's `if nan_df is None` fallback was
    # unreachable dead code and has been removed.
    nan_df = df[df['confidence'].isna()]
    return df, nan_df


def remove_unmatched(df: pd.DataFrame) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -543,9 +559,8 @@ def merge_msdf(

if reconcile:
merged_msdf.df = filter_redundant_rows(merged_msdf.df)

merged_msdf.df = deal_with_negation(merged_msdf.df) # deals with negation

return merged_msdf


Expand Down Expand Up @@ -576,7 +591,12 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
#1; #2(i) #3 and $4 are taken care of by 'filtered_merged_df' Only #2(ii) should be performed here.
"""
# Handle DataFrames with no 'confidence' column
df, nan_df = assign_default_confidence(df)

if df is None:
raise(Exception('Illegal dataframe (deal_with_negation'))

# If s,!p,o and s,p,o , then prefer higher confidence and remove the other. ###
negation_df: pd.DataFrame
negation_df = df.loc[
Expand Down Expand Up @@ -664,8 +684,8 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
reconciled_df = reconciled_df.append(
df.loc[match_condition_3[match_condition_3].index, :]
)

return reconciled_df
return_df = reconciled_df.append(nan_df).drop_duplicates()
return return_df


def dict_merge(source: Dict, target: Dict, dict_name: str) -> Dict:
Expand Down
2 changes: 1 addition & 1 deletion tests/data/basic.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ x:bone_element bone element owl:equivalentClass z:bone_tissue Lexical x z rdf_
x:bone_element bone element owl:equivalentClass z:bone_tissue Lexical|Stemming x z rdf_matcher 0.2 oio:hasBroadSynonym oio:hasBroadSynonym biolink:AnatomicalEntity biolink:AnatomicalEntity bone .
x:bone_tissue bone tissue owl:equivalentClass y:bone bones Lexical|Stemming x y rdf_matcher 0.534601961 oio:hasBroadSynonym rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity bone .
x:bone_tissue bone tissue owl:equivalentClass z:bone_element Lexical x z rdf_matcher 0.261203875 oio:hasBroadSynonym oio:hasBroadSynonym biolink:AnatomicalEntity biolink:AnatomicalEntity bone .
x:bone_tissue bone tissue owl:equivalentClass z:bone_element Lexical|Stemming x z rdf_matcher 0.2 oio:hasBroadSynonym oio:hasBroadSynonym biolink:AnatomicalEntity biolink:AnatomicalEntity bone .
x:bone_tissue bone tissue owl:equivalentClass z:bone_element Lexical|Stemming x z rdf_matcher oio:hasBroadSynonym oio:hasBroadSynonym biolink:AnatomicalEntity biolink:AnatomicalEntity bone .
x:bone_tissue bone tissue owl:equivalentClass z:bone_tissue Lexical x z rdf_matcher 0.738796125 oio:hasDbXref oio:hasDbXref biolink:AnatomicalEntity biolink:AnatomicalEntity UBERON:0002481 .
x:bone_tissue bone tissue owl:equivalentClass z:bone_tissue Lexical x z rdf_matcher 0.261203875 oio:hasBroadSynonym oio:hasBroadSynonym biolink:AnatomicalEntity biolink:AnatomicalEntity bone .
x:bone_tissue bone tissue owl:equivalentClass z:bone_tissue Lexical|Stemming x z rdf_matcher 0.2 oio:hasBroadSynonym oio:hasBroadSynonym biolink:AnatomicalEntity biolink:AnatomicalEntity bone .
Expand Down
2 changes: 1 addition & 1 deletion tests/test_collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def test_collapse(self):
def test_filter(self):
df = filter_redundant_rows(self.df)
print(df[0:20])
assert len(df.index) == 91
assert len(df.index) == 141

def test_ptable(self):
rows = dataframe_to_ptable(self.df)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_reconcile.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_df(self):
def test_filter(self):
df = filter_redundant_rows(self.msdf.df)
print(df[0:20])
assert len(df.index) == 10
assert len(df.index) == 11

def test_deal_with_negation(self):
df = deal_with_negation(self.msdf.df)
Expand All @@ -37,4 +37,4 @@ def test_merge(self):

merged_msdf = merge_msdf(msdf1=msdf1, msdf2=msdf2)

assert len(merged_msdf.df) == 94
assert len(merged_msdf.df) == 95

0 comments on commit 72e3f93

Please sign in to comment.