Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add static type checking with MyPy #143

Merged
merged 30 commits into from
Sep 22, 2021
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
597cb47
Add configuration for static type checking with mypy
cthoyt Sep 13, 2021
59e4072
Merge branch 'master' into add-mypy
cthoyt Sep 13, 2021
32bba5b
Update type annotations
cthoyt Sep 13, 2021
7c6d2c3
updated gitignore
hrshdhgd Sep 14, 2021
6975f42
mypy: changed Set[str] to Set[EntityPair]
hrshdhgd Sep 14, 2021
2a1c39f
added type Dict[Tuple[Any]]
hrshdhgd Sep 14, 2021
1e8880e
updated type: # type: Dict[Any, Any]
hrshdhgd Sep 14, 2021
384fb42
meta type: Dict[str, str]
hrshdhgd Sep 14, 2021
f0cd16f
switched to max_conf: Dict[Any, Any] = {}
hrshdhgd Sep 14, 2021
6ee9b3e
type declaration and existence checks
hrshdhgd Sep 14, 2021
0250856
util.py is mypy compliant
hrshdhgd Sep 14, 2021
cd2f983
Merge branch 'master' into add-mypy
cthoyt Sep 15, 2021
056924d
util mypy compliant
hrshdhgd Sep 15, 2021
ab19cb7
changes during meeting with Nico
hrshdhgd Sep 20, 2021
c30f575
edits for mypy compatibility
hrshdhgd Sep 20, 2021
d7dbb36
mypy recommendations addressed
hrshdhgd Sep 20, 2021
41a6379
Merge branch 'master' into add-mypy
hrshdhgd Sep 20, 2021
98fa859
mypy compliant
hrshdhgd Sep 20, 2021
2a47034
Merge branch 'master' into add-mypy
matentzn Sep 21, 2021
a25b3f9
Finish type checking
cthoyt Sep 21, 2021
0e74ca8
Fix typing
cthoyt Sep 21, 2021
fde5fda
Update cli.py
cthoyt Sep 21, 2021
b5bf96d
change type
hrshdhgd Sep 21, 2021
2574a52
flipped variables for correct capture
hrshdhgd Sep 21, 2021
812e521
determined type for invdict
hrshdhgd Sep 21, 2021
81ffa5c
narrowed down on type.
hrshdhgd Sep 21, 2021
a3c37b0
removed unused import
hrshdhgd Sep 21, 2021
76d1e32
Tuple type declared properly
hrshdhgd Sep 21, 2021
f75b724
Add mypy goal to Makefile
cthoyt Sep 22, 2021
778bfca
Make sure mypy is run with `tox`
cthoyt Sep 22, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/qc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,7 @@ jobs:
run: pip install tox
- name: Lint with flake8
run: tox -e flake8
- name: Test with MyPy
run: tox -e mypy
- name: Test with pytest
run: tox -e py
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ sphinx/_build
sphinx/_static
sphinx/_templates
docs/_build
.vscode/*
7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,10 @@ target-version = ['py36', 'py37', 'py38', 'py39']
[tool.isort]
profile = "black"
multi_line_output = 3

[[tool.mypy.overrides]]
module = [
'sssom.sssom_datamodel',
'sssom.cliquesummary'
]
ignore_errors = true
4 changes: 2 additions & 2 deletions sssom/context.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
import logging
from typing import Any, Mapping, Tuple
from typing import Any, Dict, Mapping, Tuple

from .external_context import sssom_external_context
from .internal_context import sssom_context
Expand Down Expand Up @@ -49,7 +49,7 @@ def get_default_metadata() -> Tuple[Mapping[str, Any], Mapping[str, Any]]:
contxt = get_jsonld_context()
contxt_external = get_external_jsonld_context()
curie_map = {}
meta = {}
meta: Dict[str, str] = {}
for key in contxt["@context"]:
v = contxt["@context"][key]
if isinstance(v, str):
Expand Down
72 changes: 40 additions & 32 deletions sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import sys
from dataclasses import dataclass
from io import FileIO, StringIO
from typing import Any, Dict, List, Mapping, Optional, Set, Union
from typing import Any, Dict, List, Mapping, Optional, Set, Tuple
from urllib.request import urlopen

import numpy as np
Expand Down Expand Up @@ -63,17 +63,20 @@ class MappingSetDataFrame:
A collection of mappings represented as a DataFrame, together with additional metadata
"""

df: pd.DataFrame = None # Mappings
prefixmap: Dict[str, str] = None # maps CURIE prefixes to URI bases
df: Optional[pd.DataFrame] = None # Mappings
prefixmap: Dict[str, Any] = None # maps CURIE prefixes to URI bases
metadata: Optional[Dict[str, str]] = None # header metadata excluding prefixes

def merge(self, msdf2, inplace=True):
def merge(
self, msdf2: "MappingSetDataFrame", inplace: bool = True
) -> "MappingSetDataFrame":
"""Merges two MappingSetDataframes

Args:
msdf2 (MappingSetDataFrame): Secondary MappingSetDataFrame (self => primary)
inplace (bool): if true, msdf2 is merged into the calling MappingSetDataFrame, if false, it simply return
the merged data frame.
msdf: Secondary MappingSetDataFrame (self => primary)
inplace:
if true, msdf2 is merged into the calling MappingSetDataFrame, if false, it simply return
the merged data frame.

Returns:
MappingSetDataFrame: Merged MappingSetDataFrame
Expand All @@ -83,6 +86,7 @@ def merge(self, msdf2, inplace=True):
self.df = msdf.df
self.prefixmap = msdf.prefixmap
self.metadata = msdf.metadata
# FIXME should return self if inplace
return msdf

def __str__(self):
Expand Down Expand Up @@ -142,9 +146,9 @@ class MappingSetDiff:
this is considered a mapping in common.
"""

unique_tuples1: Optional[Set[str]] = None
unique_tuples2: Optional[Set[str]] = None
common_tuples: Optional[Set[str]] = None
unique_tuples1: Optional[Set[EntityPair]] = None
unique_tuples2: Optional[Set[EntityPair]] = None
common_tuples: Optional[Set[EntityPair]] = None

combined_dataframe: Optional[pd.DataFrame] = None
"""
Expand All @@ -171,6 +175,8 @@ def load(self, filename) -> None:
self.df = read_pandas(filename)

def convert(self) -> Dict[str, Any]:
if self.df is None:
cthoyt marked this conversation as resolved.
Show resolved Hide resolved
raise RuntimeError("dataframe is not loaded properly")
# note that 'mapping' is both a metaproperty and a property of this model...
cslots = {
"mappings": {
Expand All @@ -180,7 +186,7 @@ def convert(self) -> Dict[str, Any]:
},
"id": {"description": "CURIE or IRI identifier", "identifier": True},
}
classes = {
classes: Dict[str, Any] = {
"mapping set": {
"description": "Represents a set of mappings",
"slots": ["mappings"],
Expand Down Expand Up @@ -310,7 +316,7 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate=False) -> pd.DataFr
key = [SUBJECT_ID, OBJECT_ID, PREDICATE_ID]
dfmax: pd.DataFrame
dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates()
max_conf = {}
max_conf: Dict[Any, Any] = {}
cthoyt marked this conversation as resolved.
Show resolved Hide resolved
for _, row in dfmax.iterrows():
if ignore_predicate:
max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE]
Expand Down Expand Up @@ -586,13 +592,15 @@ def merge_msdf(

merged_msdf = MappingSetDataFrame()
# If msdf2 has a DataFrame
if msdf2.df is not None:
if msdf1.df is not None and msdf2.df is not None:
# 'outer' join in pandas == FULL JOIN in SQL
merged_msdf.df = msdf1.df.merge(msdf2.df, how="outer")
else:
merged_msdf.df = msdf1.df
# merge the non DataFrame elements
merged_msdf.prefixmap = dict_merge(msdf2.prefixmap, msdf1.prefixmap, "prefixmap")
merged_msdf.prefixmap = dict_merge(
source=msdf2.prefixmap, target=msdf1.prefixmap, dict_name="prefixmap"
)
# After a Slack convo with @matentzn, commented out below.
# merged_msdf.metadata = dict_merge(msdf2.metadata, msdf1.metadata, 'metadata')

Expand Down Expand Up @@ -677,15 +685,13 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
)[CONFIDENCE].max()

# If same confidence prefer "HumanCurated".
reconciled_df_subset: pd.DataFrame
reconciled_df_subset = pd.DataFrame(columns=combined_normalized_subset.columns)
for _, row_1 in max_confidence_df.iterrows():
match_condition_1 = (
(combined_normalized_subset[SUBJECT_ID] == row_1[SUBJECT_ID])
& (combined_normalized_subset[OBJECT_ID] == row_1[OBJECT_ID])
& (combined_normalized_subset[CONFIDENCE] == row_1[CONFIDENCE])
)
match_condition_1: Union[bool, ...]
# match_condition_1[match_condition_1] gives the list of 'True's.
# In other words, the rows that match the condition (rules declared).
# Ideally, there should be 1 row. If not apply an extra rule to look for 'HumanCurated'.
Expand Down Expand Up @@ -715,12 +721,10 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
& (reconciled_df_subset[OBJECT_ID] == row_2[OBJECT_ID])
& (reconciled_df_subset[CONFIDENCE] == row_2[CONFIDENCE])
)
match_condition_2: Union[bool, ...]
reconciled_df_subset.loc[
match_condition_2[match_condition_2].index, PREDICATE_ID
] = row_2[PREDICATE_ID]

reconciled_df: pd.DataFrame
reconciled_df = pd.DataFrame(columns=df.columns)
for _, row_3 in reconciled_df_subset.iterrows():
match_condition_3 = (
Expand All @@ -729,28 +733,31 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
& (df[CONFIDENCE] == row_3[CONFIDENCE])
& (df[PREDICATE_ID] == row_3[PREDICATE_ID])
)
match_condition_3: Union[bool, ...]
reconciled_df = reconciled_df.append(
df.loc[match_condition_3[match_condition_3].index, :]
)
return_df = reconciled_df.append(nan_df).drop_duplicates()
return return_df


def dict_merge(source: Dict, target: Dict, dict_name: str) -> Dict:
def dict_merge(
*,
source: Optional[Dict[str, Any]] = None,
target: Dict[str, Any],
dict_name: str,
) -> Dict[str, Any]:
"""
Takes 2 MappingSetDataFrame elements (prefixmap OR metadata) and merges source => target

Args:
source (Dict): MappingSetDataFrame.prefixmap / MappingSetDataFrame.metadata
target (Dict): MappingSetDataFrame.prefixmap / MappingSetDataFrame.metadata
dict_name (str): prefixmap or metadata
source: MappingSetDataFrame.prefixmap / MappingSetDataFrame.metadata
target: MappingSetDataFrame.prefixmap / MappingSetDataFrame.metadata
dict_name: prefixmap or metadata

Returns:
Dict: merged MappingSetDataFrame.prefixmap / MappingSetDataFrame.metadata
"""
if source is not None:
k: str
for k, v in source.items():
if k not in target:
if v not in list(target.values()):
Expand All @@ -777,7 +784,7 @@ def inject_metadata_into_df(msdf: MappingSetDataFrame) -> MappingSetDataFrame:
Returns:
MappingSetDataFrame: MappingSetDataFrame with metadata as columns
"""
if bool(msdf.metadata):
if msdf.metadata is not None and msdf.df is not None:
for k, v in msdf.metadata.items():
if k not in msdf.df.columns:
msdf.df[k] = v
Expand Down Expand Up @@ -878,13 +885,14 @@ def to_mapping_set_dataframe(doc: MappingSetDocument) -> MappingSetDataFrame:
# convert MappingSetDocument into MappingSetDataFrame
###
data = []
for mapping in doc.mapping_set.mappings:
mdict = mapping.__dict__
m = {}
for key in mdict:
if mdict[key]:
m[key] = mdict[key]
data.append(m)
if doc.mapping_set.mappings is not None:
cthoyt marked this conversation as resolved.
Show resolved Hide resolved
for mapping in doc.mapping_set.mappings:
mdict = mapping.__dict__
m = {}
for key in mdict:
if mdict[key]:
m[key] = mdict[key]
data.append(m)
df = pd.DataFrame(data=data)
meta = extract_global_metadata(doc)
meta.pop("curie_map", None)
Expand Down
6 changes: 6 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,9 @@ deps =
flake8-bugbear
flake8-isort
description = Run the flake8 code quality checker.

[testenv:mypy]
deps = mypy
skip_install = true
commands = mypy --install-types --non-interactive --ignore-missing-imports sssom/ setup.py
description = Run the mypy tool to check static typing on the project.