diff --git a/.github/workflows/qc.yml b/.github/workflows/qc.yml index 4c599ee4..95a4425e 100644 --- a/.github/workflows/qc.yml +++ b/.github/workflows/qc.yml @@ -27,5 +27,7 @@ jobs: pip install --upgrade tox - name: Lint with flake8 run: tox -e flake8 + - name: Test with MyPy + run: tox -e mypy - name: Test with pytest run: tox -e py diff --git a/.gitignore b/.gitignore index d53a9366..e8667fc6 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,4 @@ sphinx/_build sphinx/_static sphinx/_templates docs/_build -.vscode/ +.vscode/* diff --git a/Makefile b/Makefile index 4b55a4f7..ca7fcc13 100644 --- a/Makefile +++ b/Makefile @@ -74,6 +74,11 @@ lint: pip install tox tox -e lint +.PHONY: mypy +mypy: + pip install tox + tox -e mypy + .PHONY: sphinx sphinx: cd sphinx &&\ diff --git a/pyproject.toml b/pyproject.toml index 5ad18245..61bf6afe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,3 +5,10 @@ target-version = ['py36', 'py37', 'py38', 'py39'] [tool.isort] profile = "black" multi_line_output = 3 + +[[tool.mypy.overrides]] +module = [ + 'sssom.sssom_datamodel', + 'sssom.cliquesummary' +] +ignore_errors = true diff --git a/sssom/cli.py b/sssom/cli.py index a4ea72b5..be1b9dad 100644 --- a/sssom/cli.py +++ b/sssom/cli.py @@ -2,7 +2,7 @@ import re import sys from pathlib import Path -from typing import Dict, List, TextIO, Tuple +from typing import Dict, List, Sequence, TextIO, Tuple import click import pandas as pd @@ -313,25 +313,9 @@ def sparql( prefix: List[Dict[str, str]], output: TextIO, ): - """Run a SPARQL query. - - Args: - - url (str): - config (str): - graph (str): - limit (int): - object_labels (bool): - prefix (List): - output (str): Output TSV/SSSOM file. - - - Returns: - - None. - """ - - endpoint = EndpointConfig() + """Run a SPARQL query.""" + # FIXME this usage needs _serious_ refactoring + endpoint = EndpointConfig() # type: ignore if config is not None: for k, v in yaml.safe_load(config).items(): setattr(endpoint, k, v) @@ -377,9 +361,16 @@ def diff(inputs: Tuple[str, str], output: TextIO): msdf1 = read_sssom_table(input1) msdf2 = read_sssom_table(input2) d = compare_dataframes(msdf1.df, msdf2.df) - logging.info( - f"COMMON: {len(d.common_tuples)} UNIQUE_1: {len(d.unique_tuples1)} UNIQUE_2: {len(d.unique_tuples2)}" - ) + if d.combined_dataframe is None: + raise RuntimeError + if ( + d.common_tuples is not None + and d.unique_tuples1 is not None + and d.unique_tuples2 is not None + ): + logging.info( + f"COMMON: {len(d.common_tuples)} UNIQUE_1: {len(d.unique_tuples1)} UNIQUE_2: {len(d.unique_tuples2)}" + ) d.combined_dataframe.to_csv(output, sep="\t", index=False) @@ -549,7 +540,7 @@ def correlations(input: str, output: TextIO, transpose: bool, fields: Tuple): help="Boolean indicating the need for reconciliation of the SSSOM tsv file.", ) @output_option -def merge(inputs: Tuple[str, str], output: TextIO, reconcile: bool = True): +def merge(inputs: Sequence[str], output: TextIO, reconcile: bool = True): """ Merging msdf2 into msdf1, if reconcile=True, then dedupe(remove redundant lower confidence mappings) and diff --git a/sssom/cliques.py b/sssom/cliques.py index df01594d..1f401271 100644 --- a/sssom/cliques.py +++ b/sssom/cliques.py @@ -1,12 +1,12 @@ -import collections import hashlib import statistics +from typing import Any, Dict import networkx as nx import pandas as pd from .parsers import to_mapping_set_document -from .sssom_datamodel import MappingSet +from .sssom_datamodel import Mapping, MappingSet from .sssom_document import MappingSetDocument from .util import 
MappingSetDataFrame @@ -18,41 +18,44 @@ def to_networkx(msdf: MappingSetDataFrame) -> nx.DiGraph: # m = { # "owl:subClassOf", # } - for mapping in doc.mapping_set.mappings: - s = mapping.subject_id - o = mapping.object_id - p = mapping.predicate_id - # TODO: this is copypastad from export_ptable - - pi = None - - if p == "owl:equivalentClass": - pi = 2 - elif p == "skos:exactMatch": - pi = 2 - elif p == "skos:closeMatch": - # TODO: consider distributing - pi = 2 - elif p == "owl:subClassOf": - pi = 0 - elif p == "skos:broadMatch": - pi = 0 - elif p == "inverseOf(owl:subClassOf)": - pi = 1 - elif p == "skos:narrowMatch": - pi = 1 - elif p == "owl:differentFrom": - pi = 3 - elif p == "dbpedia-owl:different": - pi = 3 - - if pi == 0: - g.add_edge(o, s) - elif pi == 1: - g.add_edge(s, o) - elif pi == 2: - g.add_edge(s, o) - g.add_edge(o, s) + if doc.mapping_set.mappings is not None: + for mapping in doc.mapping_set.mappings: + if not isinstance(mapping, Mapping): + raise TypeError + s = mapping.subject_id + o = mapping.object_id + p = mapping.predicate_id + # TODO: this is copy-pasted from export_ptable + + pi = None + + if p == "owl:equivalentClass": + pi = 2 + elif p == "skos:exactMatch": + pi = 2 + elif p == "skos:closeMatch": + # TODO: consider distributing + pi = 2 + elif p == "owl:subClassOf": + pi = 0 + elif p == "skos:broadMatch": + pi = 0 + elif p == "inverseOf(owl:subClassOf)": + pi = 1 + elif p == "skos:narrowMatch": + pi = 1 + elif p == "owl:differentFrom": + pi = 3 + elif p == "dbpedia-owl:different": + pi = 3 + + if pi == 0: + g.add_edge(o, s) + elif pi == 1: + g.add_edge(s, o) + elif pi == 2: + g.add_edge(s, o) + g.add_edge(o, s) return g @@ -65,7 +68,6 @@ def split_into_cliques(msdf: MappingSetDataFrame): comp_id = 0 newdocs = [] for comp in sorted(gen, key=len, reverse=True): - comp: collections.Iterable for n in comp: node_to_comp[n] = comp_id comp_id += 1 @@ -75,15 +77,21 @@ ) ) + if not isinstance(doc.mapping_set.mappings, list): + raise TypeError for m in doc.mapping_set.mappings: + if not isinstance(m, Mapping): + raise TypeError comp_id = node_to_comp[m.subject_id] subdoc = newdocs[comp_id] + if not isinstance(subdoc.mapping_set.mappings, list): + raise TypeError subdoc.mapping_set.mappings.append(m) return newdocs -def invert_dict(d: dict) -> dict: - invdict = {} +def invert_dict(d: Dict[str, str]) -> Dict[str, Any]: + invdict: Dict[str, Any] = {} for k, v in d.items(): if v not in invdict: invdict[v] = [] diff --git a/sssom/context.py b/sssom/context.py index b3391be7..737ec497 100644 --- a/sssom/context.py +++ b/sssom/context.py @@ -1,13 +1,16 @@ import json import logging -from typing import Any, Mapping, Tuple +from typing import Optional from .external_context import sssom_external_context from .internal_context import sssom_context +from .typehints import Metadata, MetadataType, PrefixMap # HERE = pathlib.Path(__file__).parent.resolve() # DEFAULT_CONTEXT_PATH = HERE / "sssom.context.jsonld" # EXTERNAL_CONTEXT_PATH = HERE / "sssom.external.context.jsonld" + + SSSOM_BUILT_IN_PREFIXES = ["sssom", "owl", "rdf", "rdfs", "skos"] @@ -19,7 +22,7 @@ def get_external_jsonld_context(): return json.loads(sssom_external_context, strict=False) -def get_built_in_prefix_map(): +def get_built_in_prefix_map() -> PrefixMap: contxt = get_jsonld_context() curie_map = {} for key in contxt["@context"]: @@ -30,7 +33,9 @@ return curie_map -def add_built_in_prefixes_to_prefix_map(prefixmap): +def 
add_built_in_prefixes_to_prefix_map( + prefixmap: Optional[PrefixMap] = None, +) -> PrefixMap: builtinmap = get_built_in_prefix_map() if not prefixmap: prefixmap = builtinmap @@ -45,27 +50,27 @@ return prefixmap -def get_default_metadata() -> Tuple[Mapping[str, Any], Mapping[str, Any]]: +def get_default_metadata() -> Metadata: contxt = get_jsonld_context() contxt_external = get_external_jsonld_context() - curie_map = {} - meta = {} + prefix_map = {} + metadata: MetadataType = {} for key in contxt["@context"]: v = contxt["@context"][key] if isinstance(v, str): - curie_map[key] = v + prefix_map[key] = v elif isinstance(v, dict): if "@id" in v and "@prefix" in v: if v["@prefix"]: - curie_map[key] = v["@id"] + prefix_map[key] = v["@id"] for key in contxt_external["@context"]: v = contxt_external["@context"][key] if isinstance(v, str): - if key not in curie_map: - curie_map[key] = v + if key not in prefix_map: + prefix_map[key] = v else: - if curie_map[key] != v: + if prefix_map[key] != v: logging.warning( - f"{key} is already in curie map ({curie_map[key]}, but with a different value than {v}" + f"{key} is already in curie map ({prefix_map[key]}), but with a different value than {v}" ) - return meta, curie_map + return Metadata(prefix_map=prefix_map, metadata=metadata) diff --git a/sssom/io.py b/sssom/io.py index 8215d25d..7ade31d7 100644 --- a/sssom/io.py +++ b/sssom/io.py @@ -3,6 +3,7 @@ from .context import get_default_metadata from .parsers import get_parsing_function, read_sssom_table, split_dataframe +from .typehints import Metadata from .util import raise_for_bad_path, read_metadata from .writers import get_writer_function, write_table, write_tables @@ -24,7 +25,8 @@ def convert_file( write_func, fileformat = get_writer_function( output_format=output_format, output=output ) - write_func(doc, output, serialisation=fileformat) + # TODO cthoyt figure out how to use protocols for this + write_func(doc, output, serialisation=fileformat) # type:ignore def parse_file( @@ -48,11 +50,13 @@ clean_prefixes: If True (default), records with unknown prefixes are removed from the SSSOM file. """ raise_for_bad_path(input_path) - meta, curie_map = get_metadata_and_curie_map( + metadata = get_metadata_and_curie_map( metadata_path=metadata_path, curie_map_mode=curie_map_mode ) parse_func = get_parsing_function(input_format, input_path) - doc = parse_func(input_path, curie_map=curie_map, meta=meta) + doc = parse_func( + input_path, curie_map=metadata.prefix_map, meta=metadata.metadata + ) if clean_prefixes: # We do this because we got a lot of prefixes from the default SSSOM prefixes! doc.clean_prefix_map() @@ -92,8 +96,8 @@ def split_file(input_path: str, output_directory: str) -> None: def get_metadata_and_curie_map( - metadata_path: Optional[str] = None, curie_map_mode: str = "metadata_only" -): + metadata_path: Optional[str] = None, curie_map_mode: Optional[str] = None +) -> Metadata: """ Load SSSOM metadata from a file, and then augments it with default prefixes. 
@@ -103,15 +107,17 @@ def get_metadata_and_curie_map( """ if metadata_path is None: return get_default_metadata() - - meta, curie_map = read_metadata(metadata_path) + if curie_map_mode is None: + curie_map_mode = "metadata_only" + prefix_map, metadata = read_metadata(metadata_path) + # TODO reduce complexity by flipping conditionals + # and returning eagerly (it's fine if there are multiple returns) if curie_map_mode != "metadata_only": meta_sssom, curie_map_sssom = get_default_metadata() if curie_map_mode == "sssom_default_only": - curie_map = curie_map_sssom + prefix_map = curie_map_sssom elif curie_map_mode == "merged": for prefix, uri_prefix in curie_map_sssom.items(): - if prefix not in curie_map: - curie_map[prefix] = uri_prefix - - return meta, curie_map + if prefix not in prefix_map: + prefix_map[prefix] = uri_prefix + return Metadata(prefix_map=prefix_map, metadata=metadata) diff --git a/sssom/parsers.py b/sssom/parsers.py index 80af85c7..82147bfd 100644 --- a/sssom/parsers.py +++ b/sssom/parsers.py @@ -2,7 +2,8 @@ import logging import re import typing -from typing import Any, Dict, Optional, Set, TextIO, Union +from collections import Counter +from typing import Any, Dict, List, Optional, Set, TextIO, Union, cast from urllib.request import urlopen from xml.dom import Node, minidom from xml.dom.minidom import Document @@ -17,6 +18,7 @@ from .context import add_built_in_prefixes_to_prefix_map, get_default_metadata from .sssom_datamodel import Mapping, MappingSet from .sssom_document import MappingSetDocument +from .typehints import Metadata, MetadataType, PrefixMap from .util import ( SSSOM_DEFAULT_RDF_SERIALISATION, URI_SSSOM_MAPPINGS, @@ -33,7 +35,9 @@ def read_sssom_table( - file_path: str, curie_map: Dict[str, str] = None, meta: Dict[str, str] = None + file_path: str, + curie_map: Optional[PrefixMap] = None, + meta: Optional[MetadataType] = None, ) -> MappingSetDataFrame: """ parses a TSV -> MappingSetDocument -> MappingSetDataFrame @@ -76,14 +80,14 @@ def read_sssom_rdf( parses a TSV -> MappingSetDocument -> MappingSetDataFrame """ raise_for_bad_path(file_path) - curie_map, meta = _get_curie_map_and_metadata(curie_map=curie_map, meta=meta) + metadata = _get_curie_map_and_metadata(curie_map=curie_map, meta=meta) g = Graph() g.load(file_path, format=serialisation) # json_obj = json.loads(g.serialize(format="json-ld")) # print(json_obj) # msdf = from_sssom_json(json_obj, curie_map=curie_map, meta=meta) - msdf = from_sssom_rdf(g, curie_map=curie_map, meta=meta) + msdf = from_sssom_rdf(g, curie_map=metadata.prefix_map, meta=metadata.metadata) return msdf @@ -94,11 +98,13 @@ def read_sssom_json( parses a TSV -> MappingSetDocument -> MappingSetDataFrame """ raise_for_bad_path(file_path) - curie_map, meta = _get_curie_map_and_metadata(curie_map=curie_map, meta=meta) + metadata = _get_curie_map_and_metadata(curie_map=curie_map, meta=meta) with open(file_path) as json_file: jsondoc = json.load(json_file) - msdf = from_sssom_json(jsondoc=jsondoc, curie_map=curie_map, meta=meta) + msdf = from_sssom_json( + jsondoc=jsondoc, curie_map=metadata.prefix_map, meta=metadata.metadata + ) return msdf @@ -117,48 +123,54 @@ def read_obographs_json( """ raise_for_bad_path(file_path) - curie_map, meta = _get_curie_map_and_metadata(curie_map=curie_map, meta=meta) + _xmetadata = _get_curie_map_and_metadata(curie_map=curie_map, meta=meta) with open(file_path) as json_file: jsondoc = json.load(json_file) - return from_obographs(jsondoc, curie_map, meta) + return from_obographs( + jsondoc, 
curie_map=_xmetadata.prefix_map, meta=_xmetadata.metadata + ) -def _get_curie_map_and_metadata(curie_map: Dict, meta: Dict): - default_meta, default_curie_map = get_default_metadata() +def _get_curie_map_and_metadata( + curie_map: Optional[PrefixMap] = None, meta: Optional[MetadataType] = None +) -> Metadata: + default_metadata = get_default_metadata() - if not curie_map: + if curie_map is None: logging.warning( "No curie map provided (not recommended), trying to use defaults.." ) - curie_map = default_curie_map + curie_map = default_metadata.prefix_map - if not meta: - meta = default_meta + if meta is None: + meta = default_metadata.metadata else: if curie_map and "curie_map" in meta: logging.info( - "Curie map prvoided as parameter, but SSSOM file provides its own CURIE map. " + "Curie map provided as parameter, but SSSOM file provides its own CURIE map. " "CURIE map provided externally is disregarded in favour of the curie map in the SSSOM file." ) - curie_map = meta["curie_map"] + curie_map = cast(PrefixMap, meta["curie_map"]) - return curie_map, meta + return Metadata(prefix_map=curie_map, metadata=meta) def read_alignment_xml( - file_path: str, curie_map: Dict[str, str] = None, meta: Dict[str, str] = None + file_path: str, curie_map: Dict[str, str], meta: Dict[str, str] ) -> MappingSetDataFrame: """ parses a TSV -> MappingSetDocument -> MappingSetDataFrame """ raise_for_bad_path(file_path) - curie_map, meta = _get_curie_map_and_metadata(curie_map=curie_map, meta=meta) + metadata = _get_curie_map_and_metadata(curie_map=curie_map, meta=meta) logging.info("Loading from alignment API") xmldoc = minidom.parse(file_path) - msdf = from_alignment_minidom(xmldoc, curie_map, meta) + msdf = from_alignment_minidom( + xmldoc, curie_map=metadata.prefix_map, meta=metadata.metadata + ) return msdf @@ -166,7 +178,10 @@ def read_alignment_xml( def from_sssom_dataframe( - df: pd.DataFrame, curie_map: Dict[str, str], meta: Dict[str, str] + df: pd.DataFrame, + *, + curie_map: Optional[PrefixMap] = None, + meta: Optional[MetadataType] = None, ) -> MappingSetDataFrame: """ Converts a dataframe to a MappingSetDataFrame @@ -175,15 +190,14 @@ def from_sssom_dataframe( :param meta: :return: MappingSetDataFrame """ - - _check_curie_map(curie_map) + curie_map = _ensure_prefix_map(curie_map) if "confidence" in df.columns: df["confidence"].replace(r"^\s*$", np.NaN, regex=True, inplace=True) - mlist = [] + mlist: List[Mapping] = [] ms = MappingSet() - bad_attrs = {} + bad_attrs: typing.Counter[str] = Counter() for _, row in df.iterrows(): mdict = {} for k, v in row.items(): @@ -199,16 +213,14 @@ def from_sssom_dataframe( ms[k] = v ok = True if not ok: - if k not in bad_attrs: - bad_attrs[k] = 1 - else: - bad_attrs[k] += 1 - m = _prepare_mapping(Mapping(**mdict)) + bad_attrs[k] += 1 + mlist.append(_prepare_mapping(Mapping(**mdict))) - mlist.append(m) - for k, v in bad_attrs.items(): + for k, v in bad_attrs.most_common(): logging.warning(f"No attr for {k} [{v} instances]") - ms.mappings = mlist + # the autogenerated code's type annotations are _really_ messy. 
This is in fact okay, + # so with a heavy heart we employ type:ignore + ms.mappings = mlist # type:ignore _set_metadata_in_mapping_set(mapping_set=ms, metadata=meta) doc = MappingSetDocument(mapping_set=ms, curie_map=curie_map) return to_mapping_set_dataframe(doc) @@ -216,14 +228,14 @@ def from_sssom_dataframe( def from_sssom_rdf( g: Graph, - curie_map: Dict[str, str] = None, - meta: Dict[str, str] = None, - mapping_predicates: Set[str] = None, + curie_map: Optional[PrefixMap] = None, + meta: Optional[MetadataType] = None, + mapping_predicates: Optional[Set[str]] = None, ) -> MappingSetDataFrame: """ Converts an SSSOM RDF graph into a SSSOM data table Args: - g: the Grah (rdflib) + g: the Graph (rdflib) curie_map: A dictionary conatining the prefix map meta: Potentially additional metadata mapping_predicates: A set of predicates that should be extracted from the RDF graph Returns: """ - curie_map = _check_curie_map(curie_map) + curie_map = _ensure_prefix_map(curie_map) if mapping_predicates is None: # FIXME unused mapping_predicates = _get_default_mapping_predicates() ms = MappingSet() - mlist = [] + mlist: List[Mapping] = [] for sx, px, ox in g.triples((None, URIRef(URI_SSSOM_MAPPINGS), None)): mdict = {} @@ -283,28 +295,30 @@ f"This usually happens when a critical curie_map entry is missing." ) - ms.mappings = mlist + ms.mappings = mlist # type: ignore _set_metadata_in_mapping_set(mapping_set=ms, metadata=meta) mdoc = MappingSetDocument(mapping_set=ms, curie_map=curie_map) return to_mapping_set_dataframe(mdoc) def from_sssom_json( - jsondoc: Union[str, dict, TextIO], curie_map: Dict, meta: Dict[str, str] = None + jsondoc: Union[str, dict, TextIO], + *, + curie_map: Dict[str, str], + meta: Optional[Dict[str, str]] = None, ) -> MappingSetDataFrame: - _check_curie_map(curie_map) + _ensure_prefix_map(curie_map) # noinspection PyTypeChecker ms = JSONLoader().load(source=jsondoc, target_class=MappingSet) _set_metadata_in_mapping_set(ms, metadata=meta) - ms: MappingSet mdoc = MappingSetDocument(mapping_set=ms, curie_map=curie_map) return to_mapping_set_dataframe(mdoc) def from_alignment_minidom( - dom: Document, curie_map: Dict[str, str] = None, meta: Dict[str, str] = None + dom: Document, *, curie_map: PrefixMap, meta: MetadataType ) -> MappingSetDataFrame: """ Reads a minidom Document object @@ -313,10 +327,11 @@ :param dom: :param meta: Optional meta data :return: MappingSetDocument """ - _check_curie_map(curie_map) + # FIXME: should be curie_map = _ensure_prefix_map(curie_map) + _ensure_prefix_map(curie_map) ms = MappingSet() - mlist = [] + mlist: List[Mapping] = [] # bad_attrs = {} alignments = dom.getElementsByTagName("Alignment") @@ -351,14 +366,14 @@ elif node_name == "uri2": ms["object_source"] = e.firstChild.nodeValue - ms.mappings = mlist + ms.mappings = mlist # type: ignore _set_metadata_in_mapping_set(mapping_set=ms, metadata=meta) mdoc = MappingSetDocument(mapping_set=ms, curie_map=curie_map) return to_mapping_set_dataframe(mdoc) def from_obographs( - jsondoc: Dict, curie_map: Dict[str, str], meta: Dict[str, str] = None + jsondoc: Dict, *, curie_map: PrefixMap, meta: Optional[MetadataType] = None ) -> MappingSetDataFrame: """ Converts a obographs json object to an SSSOM data frame Args: jsondoc: The JSON object representing the ontology in obographs format curie_map: The curie map to be used meta: Any additional metadata that needs to be added to the resulting SSSOM data frame Returns: An SSSOM data frame (MappingSetDataFrame) """ - _check_curie_map(curie_map) + _ensure_prefix_map(curie_map) ms = MappingSet() - mlist = [] + mlist: List[Mapping] = [] # 
bad_attrs = {} allowed_properties = [ @@ -436,7 +451,7 @@ def from_obographs( else: raise Exception("No graphs element in obographs file, wrong format?") - ms.mappings = mlist + ms.mappings = mlist # type: ignore _set_metadata_in_mapping_set(mapping_set=ms, metadata=meta) mdoc = MappingSetDocument(mapping_set=ms, curie_map=curie_map) return to_mapping_set_dataframe(mdoc) @@ -463,7 +478,7 @@ def get_parsing_function(input_format, filename): raise Exception(f"Unknown input format: {input_format}") -def _check_curie_map(curie_map): +def _ensure_prefix_map(curie_map: Optional[PrefixMap] = None) -> PrefixMap: if not curie_map: raise Exception("No valid curie_map provided") else: @@ -508,9 +523,9 @@ def _swap_object_subject(mapping: Mapping) -> Mapping: return mapping -def _read_metadata_from_table(filename: str) -> typing.Mapping[str, Any]: - if validators.url(filename): - response = urlopen(filename) +def _read_metadata_from_table(path: str) -> Dict[str, Any]: + if validators.url(path): + response = urlopen(path) yamlstr = "" for lin in response: line = lin.decode("utf-8") @@ -519,9 +534,9 @@ def _read_metadata_from_table(filename: str) -> typing.Mapping[str, Any]: else: break else: - with open(filename, "r") as s: + with open(path) as file: yamlstr = "" - for line in s: + for line in file: if line.startswith("#"): yamlstr += re.sub("^#", "", line) else: @@ -537,8 +552,10 @@ def _is_valid_mapping(m: Mapping) -> bool: return bool(m.predicate_id and m.object_id and m.subject_id) -def _set_metadata_in_mapping_set(mapping_set: MappingSet, metadata: dict) -> None: - if not metadata: +def _set_metadata_in_mapping_set( + mapping_set: MappingSet, metadata: Optional[MetadataType] = None +) -> None: + if metadata is None: logging.info("Tried setting metadata but none provided.") else: for k, v in metadata.items(): @@ -546,7 +563,7 @@ def _set_metadata_in_mapping_set(mapping_set: MappingSet, metadata: dict) -> Non mapping_set[k] = v -def _cell_element_values(cell_node, curie_map: dict) -> Optional[Mapping]: +def _cell_element_values(cell_node, curie_map: PrefixMap) -> Optional[Mapping]: mdict = {} for child in cell_node.childNodes: if child.nodeType == Node.ELEMENT_NODE: @@ -577,6 +594,8 @@ def _cell_element_values(cell_node, curie_map: dict) -> Optional[Mapping]: m = Mapping(**mdict) if _is_valid_mapping(m): return m + else: + return None # The following methods dont really belong in the parser package.. 
@@ -587,44 +606,47 @@ def to_mapping_set_document(msdf: MappingSetDataFrame) -> MappingSetDocument: if not msdf.prefixmap: raise Exception("No valid curie_map provided") - mlist = [] + mlist: List[Mapping] = [] ms = MappingSet() bad_attrs = {} - for _, row in msdf.df.iterrows(): - mdict = {} - for k, v in row.items(): - ok = False - if k: - k = str(k) - if hasattr(Mapping, k): - mdict[k] = v - ok = True - if hasattr(MappingSet, k): - ms[k] = v - ok = True - if not ok: - if k not in bad_attrs: - bad_attrs[k] = 1 - else: - bad_attrs[k] += 1 - m = _prepare_mapping(Mapping(**mdict)) - mlist.append(m) + if msdf.df is not None: + for _, row in msdf.df.iterrows(): + mdict = {} + for k, v in row.items(): + ok = False + if k: + k = str(k) + if hasattr(Mapping, k): + mdict[k] = v + ok = True + if hasattr(MappingSet, k): + ms[k] = v + ok = True + if not ok: + if k not in bad_attrs: + bad_attrs[k] = 1 + else: + bad_attrs[k] += 1 + m = _prepare_mapping(Mapping(**mdict)) + mlist.append(m) for k, v in bad_attrs.items(): logging.warning(f"No attr for {k} [{v} instances]") - ms.mappings = mlist - for k, v in msdf.metadata.items(): - if k != "curie_map": - ms[k] = v + ms.mappings = mlist # type: ignore + if msdf.metadata is not None: + for k, v in msdf.metadata.items(): + if k != "curie_map": + ms[k] = v return MappingSetDocument(mapping_set=ms, curie_map=msdf.prefixmap) def split_dataframe( msdf: MappingSetDataFrame, ) -> typing.Mapping[str, MappingSetDataFrame]: - df = msdf.df - subject_prefixes = set(df["subject_id"].str.split(":", 1, expand=True)[0]) - object_prefixes = set(df["object_id"].str.split(":", 1, expand=True)[0]) - relations = set(df["predicate_id"]) + if msdf.df is None: + raise RuntimeError + subject_prefixes = set(msdf.df["subject_id"].str.split(":", 1, expand=True)[0]) + object_prefixes = set(msdf.df["object_id"].str.split(":", 1, expand=True)[0]) + relations = set(msdf.df["predicate_id"]) return split_dataframe_by_prefix( msdf=msdf, subject_prefixes=subject_prefixes, @@ -654,12 +676,12 @@ def split_dataframe_by_prefix( relpre = rel.split(":")[0] relppost = rel.split(":")[1] split_name = f"{pre_subj.lower()}_{relppost.lower()}_{pre_obj.lower()}" - - dfs = df[ - (df["subject_id"].str.startswith(pre_subj + ":")) - & (df["predicate_id"] == rel) - & (df["object_id"].str.startswith(pre_obj + ":")) - ] + if df is not None: + dfs = df[ + (df["subject_id"].str.startswith(pre_subj + ":")) + & (df["predicate_id"] == rel) + & (df["object_id"].str.startswith(pre_obj + ":")) + ] if pre_subj in curie_map and pre_obj in curie_map and len(dfs) > 0: cm = { pre_subj: curie_map[pre_subj], diff --git a/sssom/rdf_util.py b/sssom/rdf_util.py index 599def70..031fcf39 100644 --- a/sssom/rdf_util.py +++ b/sssom/rdf_util.py @@ -1,38 +1,42 @@ import logging -from typing import List +from typing import Any, Dict, List, Optional from rdflib import Graph, URIRef -from rdflib.plugins.memory import Any from .parsers import to_mapping_set_document -from .sssom_datamodel import Mapping +from .sssom_datamodel import EntityId, Mapping from .util import MappingSetDataFrame def rewire_graph( g: Graph, mset: MappingSetDataFrame, - subject_to_object=True, - precedence: List[str] = None, -) -> str: + subject_to_object: bool = True, + precedence: Optional[List[str]] = None, +) -> int: """ rewires an RDF Graph replacing using equivalence mappings """ pm = mset.prefixmap mdoc = to_mapping_set_document(mset) - rewire_map = {} + rewire_map: Dict[EntityId, EntityId] = {} - def expand_curie(curie: str): + def expand_curie(curie: str) 
-> URIRef: pfx, local = curie.split(":") return URIRef(f"{pm[pfx]}{local}") + if mdoc.mapping_set.mappings is None: + raise TypeError for m in mdoc.mapping_set.mappings: - m: Mapping + if not isinstance(m, Mapping): + continue if m.predicate_id in {"owl:equivalentClass", "owl:equivalentProperty"}: if subject_to_object: src, tgt = m.subject_id, m.object_id else: src, tgt = m.object_id, m.subject_id + if not isinstance(src, EntityId) or not isinstance(tgt, EntityId): + raise TypeError if src in rewire_map: curr_tgt = rewire_map[src] logging.info(f"Ambiguous: {src} -> {tgt} vs {curr_tgt}") @@ -49,12 +53,15 @@ def expand_curie(curie: str): raise ValueError(f"Ambiguous: {src} -> {tgt} vs {curr_tgt}") else: rewire_map[src] = tgt - rewire_map = {expand_curie(k): expand_curie(v) for k, v in rewire_map.items()} + + uri_ref_rewire_map: Dict[URIRef, URIRef] = { + expand_curie(k): expand_curie(v) for k, v in rewire_map.items() + } def rewire_node(n: Any): if isinstance(n, URIRef): - if n in rewire_map: - return rewire_map[n] + if n in uri_ref_rewire_map: + return uri_ref_rewire_map[n] else: return n else: diff --git a/sssom/sparql_util.py b/sssom/sparql_util.py index 4f954ad3..f83124a9 100644 --- a/sssom/sparql_util.py +++ b/sssom/sparql_util.py @@ -1,6 +1,6 @@ import logging from dataclasses import dataclass -from typing import Dict, List, Optional +from typing import Dict, List, Mapping, Optional import pandas as pd from rdflib import URIRef @@ -12,12 +12,12 @@ @dataclass class EndpointConfig: - url: str = None - graph: URIRef = None - predmap: Dict[str, str] = None - predicates: Optional[List[str]] = None - limit: Optional[int] = None - curie_map: Optional[Dict[str, str]] = None + url: str + graph: URIRef + predmap: Dict[str, str] + predicates: Optional[List[str]] + limit: Optional[int] + curie_map: Optional[Dict[str, str]] include_object_labels: bool = False @@ -28,20 +28,21 @@ def query_mappings(config: EndpointConfig) -> MappingSetDataFrame: sparql = SPARQLWrapper(config.url) if config.graph is None: g = "?g" + elif isinstance(config.graph, str): + g = URIRef(config.graph).n3() else: - g = config.graph - if isinstance(g, str): - g = URIRef(g) - g = g.n3() - preds = config.predicates - if preds is None: - preds = {SKOS.exactMatch, SKOS.closeMatch} + g = config.graph.n3() + if config.predicates is None: + predicates = [SKOS.exactMatch, SKOS.closeMatch] else: - preds = [expand_curie(p, config) for p in preds] - predstr = " ".join([p.n3() for p in preds]) - limitstr = "" + predicates = [ + expand_curie(predicate, config) for predicate in config.predicates + ] + predstr = " ".join(URIRef(predicate).n3() for predicate in predicates) if config.limit is not None: limitstr = f"LIMIT {config.limit}" + else: + limitstr = "" cols = [ "subject_id", "subject_label", @@ -57,7 +58,7 @@ def query_mappings(config: EndpointConfig) -> MappingSetDataFrame: if config.include_object_labels else "" ) - q = f""" + q = f"""\ PREFIX rdfs: {RDFS.uri.n3()} SELECT {colstr} WHERE {{ @@ -79,14 +80,13 @@ def query_mappings(config: EndpointConfig) -> MappingSetDataFrame: row = {k: v["value"] for k, v in result.items()} rows.append(curiefy_row(row, config)) df = pd.DataFrame(rows) + if config.curie_map is None: + raise TypeError return MappingSetDataFrame(df=df, prefixmap=config.curie_map) -def curiefy_row(row: Dict[str, str], config: EndpointConfig) -> Dict[str, str]: - new_row = {} - for k, v in row.items(): - new_row[k] = contract_uri(v, config) - return new_row +def curiefy_row(row: Mapping[str, str], config: EndpointConfig) 
-> Dict[str, str]: + return {k: contract_uri(v, config) for k, v in row.items()} def contract_uri(uristr: str, config: EndpointConfig) -> str: diff --git a/sssom/sssom_document.py b/sssom/sssom_document.py index 71fbdd7f..52b045bd 100644 --- a/sssom/sssom_document.py +++ b/sssom/sssom_document.py @@ -1,7 +1,7 @@ from dataclasses import dataclass -from typing import Dict from .sssom_datamodel import MappingSet +from .typehints import PrefixMap @dataclass() @@ -17,7 +17,7 @@ class MappingSetDocument: The main part of the document: a set of mappings plus metadata """ - curie_map: Dict[str, str] + curie_map: PrefixMap """ Mappings between ID prefixes and URI Bases, used to map CURIEs to URIs. Note that the CURIE map is not part of the core SSSOM model, hence it belongs here in the document diff --git a/sssom/typehints.py b/sssom/typehints.py new file mode 100644 index 00000000..d3eda828 --- /dev/null +++ b/sssom/typehints.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +"""Type hints for SSSOM.""" + +from typing import Any, Dict, NamedTuple + +__all__ = [ + "PrefixMap", + "MetadataType", + "Metadata", +] + +PrefixMap = Dict[str, str] + +#: TODO replace this with something more specific +MetadataType = Dict[str, Any] + + +class Metadata(NamedTuple): + prefix_map: PrefixMap + metadata: MetadataType diff --git a/sssom/util.py b/sssom/util.py index d95a23ae..882c3eef 100644 --- a/sssom/util.py +++ b/sssom/util.py @@ -3,9 +3,21 @@ import logging import os import re -from dataclasses import dataclass +from collections import defaultdict +from dataclasses import dataclass, field from io import FileIO, StringIO -from typing import Any, Dict, List, Mapping, Optional, Set, TextIO, Union +from typing import ( + Any, + DefaultDict, + Dict, + List, + Mapping, + Optional, + Set, + TextIO, + Tuple, + Union, +) from urllib.request import urlopen import numpy as np @@ -16,6 +28,7 @@ from .context import get_default_metadata, get_jsonld_context from .sssom_datamodel import Entity, slots from .sssom_document import MappingSetDocument +from .typehints import Metadata, MetadataType, PrefixMap SSSOM_READ_FORMATS = [ "tsv", @@ -61,17 +74,21 @@ class MappingSetDataFrame: A collection of mappings represented as a DataFrame, together with additional metadata """ - df: pd.DataFrame = None # Mappings - prefixmap: Dict[str, str] = None # maps CURIE prefixes to URI bases - metadata: Optional[Dict[str, str]] = None # header metadata excluding prefixes + df: Optional[pd.DataFrame] = None # Mappings + #: maps CURIE prefixes to URI bases + prefixmap: PrefixMap = field(default_factory=dict) + metadata: Optional[MetadataType] = None # header metadata excluding prefixes - def merge(self, msdf2, inplace=True): + def merge( + self, msdf2: "MappingSetDataFrame", inplace: bool = True + ) -> "MappingSetDataFrame": """Merges two MappingSetDataframes Args: - msdf2 (MappingSetDataFrame): Secondary MappingSetDataFrame (self => primary) - inplace (bool): if true, msdf2 is merged into the calling MappingSetDataFrame, if false, it simply return - the merged data frame. + msdf2: Secondary MappingSetDataFrame (self => primary) + inplace: + if true, msdf2 is merged into the calling MappingSetDataFrame, if false, it simply returns + the merged data frame. 
Returns: MappingSetDataFrame: Merged MappingSetDataFrame @@ -81,6 +98,7 @@ def merge(self, msdf2, inplace=True): self.df = msdf.df self.prefixmap = msdf.prefixmap self.metadata = msdf.metadata + # FIXME should return self if inplace return msdf def __str__(self): @@ -94,9 +112,9 @@ def __str__(self): description += self.df.tail().to_string() + "\n" return description - def clean_prefix_map(self): + def clean_prefix_map(self) -> None: prefixes_in_map = get_prefixes_used_in_table(self.df) - new_prefixes = dict() + new_prefixes: PrefixMap = dict() missing_prefix = [] for prefix in prefixes_in_map: if prefix in self.prefixmap: @@ -140,9 +158,9 @@ class MappingSetDiff: this is considered a mapping in common. """ - unique_tuples1: Optional[Set[str]] = None - unique_tuples2: Optional[Set[str]] = None - common_tuples: Optional[Set[str]] = None + unique_tuples1: Optional[Set[EntityPair]] = None + unique_tuples2: Optional[Set[EntityPair]] = None + common_tuples: Optional[Set[EntityPair]] = None combined_dataframe: Optional[pd.DataFrame] = None """ @@ -169,6 +187,8 @@ def load(self, filename) -> None: self.df = read_pandas(filename) def convert(self) -> Dict[str, Any]: + if self.df is None: + raise RuntimeError("dataframe is not loaded properly") # note that 'mapping' is both a metaproperty and a property of this model... cslots = { "mappings": { @@ -178,7 +198,7 @@ def convert(self) -> Dict[str, Any]: }, "id": {"description": "CURIE or IRI identifier", "identifier": True}, } - classes = { + classes: Dict[str, Any] = { "mapping set": { "description": "Represents a set of mappings", "slots": ["mappings"], @@ -252,7 +272,7 @@ def convert_and_save(self, fn: str) -> None: yaml.safe_dump(obj, stream, sort_keys=False) -def parse(filename) -> pd.DataFrame: +def parse(filename: str) -> pd.DataFrame: """ parses a TSV to a pandas frame """ @@ -262,7 +282,7 @@ def parse(filename) -> pd.DataFrame: # return read_pandas(filename) -def collapse(df): +def collapse(df: pd.DataFrame) -> pd.DataFrame: """ collapses rows with same S/P/O and combines confidence """ @@ -274,7 +294,7 @@ def collapse(df): return df2 -def sort_sssom_columns(columns: list) -> list: +def sort_sssom_columns(columns: List[str]) -> List[str]: # Ideally, the order of the sssom column names is parsed strictly from sssom.yaml logging.warning("SSSOM sort columns not implemented") @@ -289,7 +309,9 @@ def sort_sssom(df: pd.DataFrame) -> pd.DataFrame: return df -def filter_redundant_rows(df: pd.DataFrame, ignore_predicate=False) -> pd.DataFrame: +def filter_redundant_rows( + df: pd.DataFrame, ignore_predicate: bool = False +) -> pd.DataFrame: """ removes rows if there is another row with same S/O and higher confidence @@ -308,7 +330,7 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate=False) -> pd.DataFr key = [SUBJECT_ID, OBJECT_ID, PREDICATE_ID] dfmax: pd.DataFrame dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates() - max_conf = {} + max_conf: Dict[Tuple[str, ...], float] = {} for _, row in dfmax.iterrows(): if ignore_predicate: max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE] @@ -337,7 +359,7 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate=False) -> pd.DataFr return return_df -def assign_default_confidence(df: pd.DataFrame): +def assign_default_confidence(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: # Get rows having numpy.NaN as confidence if df is not None and "confidence" not in df.columns: df["confidence"] = np.NaN @@ -357,7 +379,7 @@ def remove_unmatched(df: 
pd.DataFrame) -> pd.DataFrame: return df[df[PREDICATE_ID] != "noMatch"] -def create_entity(row, eid: str, mappings: Dict) -> Entity: +def create_entity(row, eid: str, mappings: Dict[str, Any]) -> Entity: logging.warning(f"create_entity() has row parameter ({row}), but not used.") e = Entity(id=eid) for k, v in mappings.items(): @@ -366,37 +388,32 @@ def create_entity(row, eid: str, mappings: Dict) -> Entity: return e -def group_mappings(df: pd.DataFrame) -> Dict[EntityPair, List]: +def group_mappings(df: pd.DataFrame) -> Dict[EntityPair, List[pd.Series]]: """ group mappings by EntityPairs """ - mappings: Dict = {} + mappings: DefaultDict[EntityPair, List[pd.Series]] = defaultdict(list) for _, row in df.iterrows(): - sid = row[SUBJECT_ID] - oid = row[OBJECT_ID] - s = create_entity( + subject_entity = create_entity( row, - sid, + row[SUBJECT_ID], { "label": SUBJECT_LABEL, "category": SUBJECT_CATEGORY, "source": SUBJECT_SOURCE, }, ) - o = create_entity( + object_entity = create_entity( row, - oid, + row[OBJECT_ID], { "label": OBJECT_LABEL, "category": OBJECT_CATEGORY, "source": OBJECT_SOURCE, }, ) - pair = EntityPair(s, o) - if pair not in mappings: - mappings[pair] = [] - mappings[pair].append(row) - return mappings + mappings[EntityPair(subject_entity, object_entity)].append(row) + return dict(mappings) def compare_dataframes(df1: pd.DataFrame, df2: pd.DataFrame) -> MappingSetDiff: @@ -569,13 +586,15 @@ def merge_msdf( merged_msdf = MappingSetDataFrame() # If msdf2 has a DataFrame - if msdf2.df is not None: + if msdf1.df is not None and msdf2.df is not None: # 'outer' join in pandas == FULL JOIN in SQL merged_msdf.df = msdf1.df.merge(msdf2.df, how="outer") else: merged_msdf.df = msdf1.df # merge the non DataFrame elements - merged_msdf.prefixmap = dict_merge(msdf2.prefixmap, msdf1.prefixmap, "prefixmap") + merged_msdf.prefixmap = dict_merge( + source=msdf2.prefixmap, target=msdf1.prefixmap, dict_name="prefixmap" + ) # After a Slack convo with @matentzn, commented out below. # merged_msdf.metadata = dict_merge(msdf2.metadata, msdf1.metadata, 'metadata') @@ -660,7 +679,6 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: )[CONFIDENCE].max() # If same confidence prefer "HumanCurated". - reconciled_df_subset: pd.DataFrame reconciled_df_subset = pd.DataFrame(columns=combined_normalized_subset.columns) for _, row_1 in max_confidence_df.iterrows(): match_condition_1 = ( @@ -668,7 +686,6 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: & (combined_normalized_subset[OBJECT_ID] == row_1[OBJECT_ID]) & (combined_normalized_subset[CONFIDENCE] == row_1[CONFIDENCE]) ) - match_condition_1: Union[bool, ...] # match_condition_1[match_condition_1] gives the list of 'True's. # In other words, the rows that match the condition (rules declared). # Ideally, there should be 1 row. If not apply an extra rule to look for 'HumanCurated'. @@ -698,12 +715,10 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: & (reconciled_df_subset[OBJECT_ID] == row_2[OBJECT_ID]) & (reconciled_df_subset[CONFIDENCE] == row_2[CONFIDENCE]) ) - match_condition_2: Union[bool, ...] 
reconciled_df_subset.loc[ match_condition_2[match_condition_2].index, PREDICATE_ID ] = row_2[PREDICATE_ID] - reconciled_df: pd.DataFrame reconciled_df = pd.DataFrame(columns=df.columns) for _, row_3 in reconciled_df_subset.iterrows(): match_condition_3 = ( @@ -712,7 +727,6 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: & (df[CONFIDENCE] == row_3[CONFIDENCE]) & (df[PREDICATE_ID] == row_3[PREDICATE_ID]) ) - match_condition_3: Union[bool, ...] reconciled_df = reconciled_df.append( df.loc[match_condition_3[match_condition_3].index, :] ) @@ -720,20 +734,24 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: return return_df -def dict_merge(source: Dict, target: Dict, dict_name: str) -> Dict: +def dict_merge( + *, + source: Optional[Dict[str, Any]] = None, + target: Dict[str, Any], + dict_name: str, +) -> Dict[str, Any]: """ Takes 2 MappingSetDataFrame elements (prefixmap OR metadata) and merges source => target Args: - source (Dict): MappingSetDataFrame.prefixmap / MappingSetDataFrame.metadata - target (Dict): MappingSetDataFrame.prefixmap / MappingSetDataFrame.metadata - dict_name (str): prefixmap or metadata + source: MappingSetDataFrame.prefixmap / MappingSetDataFrame.metadata + target: MappingSetDataFrame.prefixmap / MappingSetDataFrame.metadata + dict_name: prefixmap or metadata Returns: Dict: merged MappingSetDataFrame.prefixmap / MappingSetDataFrame.metadata """ if source is not None: - k: str for k, v in source.items(): if k not in target: if v not in list(target.values()): @@ -760,15 +778,18 @@ def inject_metadata_into_df(msdf: MappingSetDataFrame) -> MappingSetDataFrame: Returns: MappingSetDataFrame: MappingSetDataFrame with metadata as columns """ - if bool(msdf.metadata): + if msdf.metadata is not None and msdf.df is not None: for k, v in msdf.metadata.items(): if k not in msdf.df.columns: msdf.df[k] = v return msdf -def get_file_extension(file: TextIO) -> str: - filename = file.name +def get_file_extension(file: Union[str, TextIO]) -> str: + if isinstance(file, str): + filename = file + else: + filename = file.name parts = filename.split(".") if len(parts) > 0: f_format = parts[-1] @@ -793,7 +814,7 @@ def read_csv(filename, comment="#", sep=","): return pd.read_csv(StringIO(lines), sep=sep) -def read_metadata(filename): +def read_metadata(filename: str) -> Metadata: """ Read a metadata file (yaml) that is supplied separately from a TSV. @@ -811,35 +832,27 @@ def read_metadata(filename): meta = m except yaml.YAMLError as exc: print(exc) # FIXME this clobbers the exception. Remove try/except - return meta, curie_map + return Metadata(prefix_map=curie_map, metadata=meta) -def read_pandas(filename: str, sep: Optional[str] = "\t") -> pd.DataFrame: +def read_pandas(file: Union[str, TextIO], sep: Optional[str] = None) -> pd.DataFrame: """ Read a tabular data file by wrapping func:`pd.read_csv` to handles comment lines correctly. - :param filename: + :param file: The file to read. If no separator is given, this file should be named. 
:param sep: File separator in pandas (\t or ,) - :return: + :return: A pandas dataframe """ - if not sep: - extension = get_file_extension(filename) - sep = "\t" + if sep is None: + extension = get_file_extension(file) if extension == "tsv": sep = "\t" elif extension == "csv": sep = "," else: + sep = "\t" logging.warning("Cannot automatically determine table format, trying tsv.") - - # from tempfile import NamedTemporaryFile - # with NamedTemporaryFile("r+") as tmp: - # with open(filename, "r") as f: - # for line in f: - # if not line.startswith('#'): - # tmp.write(line + "\n") - # tmp.seek(0) - return read_csv(filename, comment="#", sep=sep).fillna("") + return read_csv(file, comment="#", sep=sep).fillna("") def extract_global_metadata(msdoc: MappingSetDocument): @@ -862,13 +875,14 @@ def to_mapping_set_dataframe(doc: MappingSetDocument) -> MappingSetDataFrame: # convert MappingSetDocument into MappingSetDataFrame ### data = [] - for mapping in doc.mapping_set.mappings: - mdict = mapping.__dict__ - m = {} - for key in mdict: - if mdict[key]: - m[key] = mdict[key] - data.append(m) + if doc.mapping_set.mappings is not None: + for mapping in doc.mapping_set.mappings: + mdict = mapping.__dict__ + m = {} + for key in mdict: + if mdict[key]: + m[key] = mdict[key] + data.append(m) df = pd.DataFrame(data=data) meta = extract_global_metadata(doc) meta.pop("curie_map", None) @@ -932,7 +946,7 @@ def filter_out_prefixes(df: pd.DataFrame, filter_prefixes) -> pd.DataFrame: return pd.DataFrame(columns=KEY_FEATURES) -def guess_file_format(filename): +def guess_file_format(filename: Union[str, TextIO]) -> str: extension = get_file_extension(filename) if extension in ["owl", "rdf"]: return SSSOM_DEFAULT_RDF_SERIALISATION @@ -944,11 +958,10 @@ def guess_file_format(filename): ) -def prepare_context_from_curie_map(curie_map: dict): - meta, default_curie_map = get_default_metadata() +def prepare_context_from_curie_map(curie_map: Optional[PrefixMap] = None) -> str: context = get_jsonld_context() - if not curie_map: - curie_map = default_curie_map + if curie_map is None: + curie_map = get_default_metadata().prefix_map for k, v in curie_map.items(): if isinstance(v, str): diff --git a/sssom/writers.py b/sssom/writers.py index 18ed536c..cb0c5ded 100644 --- a/sssom/writers.py +++ b/sssom/writers.py @@ -1,7 +1,7 @@ import json import logging import os -from typing import Callable, Optional, TextIO, Tuple +from typing import Any, Callable, Dict, Optional, TextIO, Tuple import pandas as pd import yaml @@ -43,15 +43,16 @@ def write_table(msdf: MappingSetDataFrame, file: TextIO, serialisation="tsv") -> """ dataframe 2 tsv """ + if msdf.df is None: + raise TypeError sep = _get_separator(serialisation) # df = to_dataframe(msdf) + meta: Dict[str, Any] = {} if msdf.metadata is not None: - meta = {k: v for k, v in msdf.metadata.items()} - else: - meta = {} + meta.update(msdf.metadata) if msdf.prefixmap is not None: meta["curie_map"] = msdf.prefixmap @@ -125,9 +126,9 @@ def write_owl( def to_dataframe(msdf: MappingSetDataFrame) -> pd.DataFrame: data = [] - doc = to_mapping_set_document(msdf) - + if doc.mapping_set.mappings is None: + raise TypeError for mapping in doc.mapping_set.mappings: mdict = mapping.__dict__ m = {} diff --git a/tests/test_parsers.py b/tests/test_parsers.py index d14e3cde..890cda52 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -52,7 +52,7 @@ def setUp(self) -> None: self.alignmentxml_file = f"{test_data_dir}/oaei-ordo-hp.rdf" self.alignmentxml = minidom.parse(self.alignmentxml_file) 
- self.metadata, self.curie_map = get_default_metadata() + self.curie_map, self.metadata = get_default_metadata() def test_parse_sssom_dataframe(self): input_path = f"{test_data_dir}/basic.tsv" diff --git a/tox.ini b/tox.ini index 46acdbde..ae7fb47a 100644 --- a/tox.ini +++ b/tox.ini @@ -7,6 +7,7 @@ envlist = lint flake8 + mypy py [testenv] @@ -40,3 +41,9 @@ deps = flake8-bugbear flake8-isort description = Run the flake8 code quality checker. + +[testenv:mypy] +deps = mypy +skip_install = true +commands = mypy --install-types --non-interactive --ignore-missing-imports sssom/ setup.py +description = Run the mypy tool to check static typing on the project.