Skip to content

Commit

Permalink
Merge pull request #98 from mapping-commons/issue_97
Browse files Browse the repository at this point in the history
accommodating for blank confidence #97
  • Loading branch information
matentzn authored Jul 23, 2021
2 parents 6c71967 + e0902cc commit 72e3f93
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 11 deletions.
5 changes: 5 additions & 0 deletions sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
is_curie,
)
from .util import RDF_FORMATS
import numpy as np

cwd = os.path.abspath(os.path.dirname(__file__))

Expand All @@ -37,6 +38,10 @@ def from_tsv(
"""
if validators.url(file_path) or os.path.exists(file_path):
df = read_pandas(file_path)

if 'confidence' in df.columns:
df['confidence'].replace(r'^\s*$', np.NaN, regex=True, inplace=True)

if meta is None:
meta = _read_metadata_from_table(file_path)
if "curie_map" in meta:
Expand Down
34 changes: 27 additions & 7 deletions sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import pandas as pd
import yaml
import numpy as np

from sssom.sssom_datamodel import Entity, slots
from .sssom_document import MappingSetDocument
Expand Down Expand Up @@ -287,6 +288,7 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate=False) -> pd.DataFr
# create a 'sort' method and then replace the following line by sort()
df = sort_sssom(df)
# df[CONFIDENCE] = df[CONFIDENCE].apply(lambda x: x + random.random() / 10000)
df, nan_df = assign_default_confidence(df)
if ignore_predicate:
key = [SUBJECT_ID, OBJECT_ID]
else:
Expand All @@ -302,21 +304,35 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate=False) -> pd.DataFr
CONFIDENCE
]
if ignore_predicate:
return df[
df[
df.apply(
lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])],
axis=1,
)
]
else:
return df[
df[
df.apply(
lambda x: x[CONFIDENCE]
>= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])],
axis=1,
)
]

# We are preserving confidence = NaN rows without making assumptions.
# This means that there are potential duplicate mappings
return_df = df.append(nan_df).drop_duplicates()
return return_df

def assign_default_confidence(df: pd.DataFrame):
    """Ensure ``df`` has a ``confidence`` column and split out the NaN rows.

    If the column is missing, it is added **in place** (the caller's frame is
    mutated) and filled with ``numpy.NaN`` for every row.

    :param df: mappings DataFrame; mutated in place when the ``confidence``
        column is absent. Assumed non-None (a ``None`` input would fail on
        the column access below, as in the original).
    :return: tuple ``(df, nan_df)`` where ``nan_df`` contains exactly the
        rows whose confidence is NaN (an empty DataFrame with the same
        columns when every row has a value).
    """
    if df is not None and 'confidence' not in df.columns:
        df['confidence'] = np.NaN

    # Boolean-mask selection always yields a DataFrame (possibly empty),
    # never None — the original's `if nan_df is None` fallback was
    # unreachable dead code and has been removed.
    nan_df = df[df['confidence'].isna()]
    return df, nan_df


def remove_unmatched(df: pd.DataFrame) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -543,9 +559,8 @@ def merge_msdf(

if reconcile:
merged_msdf.df = filter_redundant_rows(merged_msdf.df)

merged_msdf.df = deal_with_negation(merged_msdf.df) # deals with negation

return merged_msdf


Expand Down Expand Up @@ -576,7 +591,12 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
#1; #2(i) #3 and $4 are taken care of by 'filtered_merged_df' Only #2(ii) should be performed here.
"""
# Handle DataFrames with no 'confidence' column
df, nan_df = assign_default_confidence(df)

if df is None:
raise(Exception('Illegal dataframe (deal_with_negation'))

# If s,!p,o and s,p,o , then prefer higher confidence and remove the other. ###
negation_df: pd.DataFrame
negation_df = df.loc[
Expand Down Expand Up @@ -664,8 +684,8 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
reconciled_df = reconciled_df.append(
df.loc[match_condition_3[match_condition_3].index, :]
)

return reconciled_df
return_df = reconciled_df.append(nan_df).drop_duplicates()
return return_df


def dict_merge(source: Dict, target: Dict, dict_name: str) -> Dict:
Expand Down
2 changes: 1 addition & 1 deletion tests/data/basic.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ x:bone_element bone element owl:equivalentClass z:bone_tissue Lexical x z rdf_
x:bone_element bone element owl:equivalentClass z:bone_tissue Lexical|Stemming x z rdf_matcher 0.2 oio:hasBroadSynonym oio:hasBroadSynonym biolink:AnatomicalEntity biolink:AnatomicalEntity bone .
x:bone_tissue bone tissue owl:equivalentClass y:bone bones Lexical|Stemming x y rdf_matcher 0.534601961 oio:hasBroadSynonym rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity bone .
x:bone_tissue bone tissue owl:equivalentClass z:bone_element Lexical x z rdf_matcher 0.261203875 oio:hasBroadSynonym oio:hasBroadSynonym biolink:AnatomicalEntity biolink:AnatomicalEntity bone .
x:bone_tissue bone tissue owl:equivalentClass z:bone_element Lexical|Stemming x z rdf_matcher 0.2 oio:hasBroadSynonym oio:hasBroadSynonym biolink:AnatomicalEntity biolink:AnatomicalEntity bone .
x:bone_tissue bone tissue owl:equivalentClass z:bone_element Lexical|Stemming x z rdf_matcher oio:hasBroadSynonym oio:hasBroadSynonym biolink:AnatomicalEntity biolink:AnatomicalEntity bone .
x:bone_tissue bone tissue owl:equivalentClass z:bone_tissue Lexical x z rdf_matcher 0.738796125 oio:hasDbXref oio:hasDbXref biolink:AnatomicalEntity biolink:AnatomicalEntity UBERON:0002481 .
x:bone_tissue bone tissue owl:equivalentClass z:bone_tissue Lexical x z rdf_matcher 0.261203875 oio:hasBroadSynonym oio:hasBroadSynonym biolink:AnatomicalEntity biolink:AnatomicalEntity bone .
x:bone_tissue bone tissue owl:equivalentClass z:bone_tissue Lexical|Stemming x z rdf_matcher 0.2 oio:hasBroadSynonym oio:hasBroadSynonym biolink:AnatomicalEntity biolink:AnatomicalEntity bone .
Expand Down
2 changes: 1 addition & 1 deletion tests/test_collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def test_collapse(self):
def test_filter(self):
df = filter_redundant_rows(self.df)
print(df[0:20])
assert len(df.index) == 91
assert len(df.index) == 141

def test_ptable(self):
rows = dataframe_to_ptable(self.df)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_reconcile.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_df(self):
def test_filter(self):
df = filter_redundant_rows(self.msdf.df)
print(df[0:20])
assert len(df.index) == 10
assert len(df.index) == 11

def test_deal_with_negation(self):
df = deal_with_negation(self.msdf.df)
Expand All @@ -37,4 +37,4 @@ def test_merge(self):

merged_msdf = merge_msdf(msdf1=msdf1, msdf2=msdf2)

assert len(merged_msdf.df) == 94
assert len(merged_msdf.df) == 95

0 comments on commit 72e3f93

Please sign in to comment.