Merge pull request #40 from meaningfy-ws/feature/lam-181

Feature/lam 181
meaningfy-ws · Feb 27, 2021 · d32f44c · d32f44c
2 parents 0a27f74 + e1f78a7
commit d32f44c
Show file tree

Hide file tree

Showing 8 changed files with 126 additions and 79 deletions.
diff --git a/eds4jinja2/__init__.py b/eds4jinja2/__init__.py
@@ -8,8 +8,8 @@
 __docformat__ = "restructuredtext en"
 
 # The format of the __version__ line is matched by a regex in setup.py and /docs/conf.py
-__version__ = "0.1.32"
-__date__ = "2021-02-25"
+__version__ = "0.1.33"
+__date__ = "2021-02-26"
 
 import logging
 

diff --git a/eds4jinja2/adapters/__init__.py b/eds4jinja2/adapters/__init__.py
@@ -17,23 +17,19 @@ def sort_by_size_and_alphabet(l: List) -> List:
     return sorted(l, key=lambda x: (len(x), x))
 
 
-def first_key(d: Dict) -> object:
+def first_key(d: (Dict, None)) -> object:
     """
         Return the first dict key that from all the keys ordered first by their length and then alphabetically.
-    :param d:
-    :return:
     """
-    return sort_by_size_and_alphabet(d.keys())[0] if d else None
+    return sort_by_size_and_alphabet(list(d.keys()))[0] if d else None
 
 
-def first_key_value(d: Dict) -> object:
+def first_key_value(d: (Dict, None)) -> object:
     """
         Return the dict value for the first key in the dict;
         The first key is determined using `first_key` function.
-    :param d:
-    :return:
     """
-    return d[first_key(d)]
+    return d[first_key(d)] if d else None
 
 
 def invert_dict(mapping_dict: Dict, reduce_values: bool = True):
@@ -44,7 +40,7 @@ def invert_dict(mapping_dict: Dict, reduce_values: bool = True):
 
         The list can be reduced to single item by setting reduce_values=True.
 
-        >>> d = {"a":1, "b":2, c:1}
+        >>> d = {"a":1, "b":2, "c":1}
         >>> reduced_d = invert_dict(d)
         {1: 'a', 2: 'b'}
 
@@ -77,4 +73,4 @@ def deep_update(source, overrides):
             source[key] = returned
         else:
             source[key] = overrides[key]
-    return source
+    return source
diff --git a/eds4jinja2/adapters/namespace_handler.py b/eds4jinja2/adapters/namespace_handler.py
@@ -14,15 +14,14 @@
 """
 
 import logging
-from pprint import pprint
-from typing import Dict, List
+import re
+from typing import List
 
-import numpy as np
 import rdflib
 from pandas import DataFrame
 
 from eds4jinja2.adapters import invert_dict
-from eds4jinja2.adapters.prefix_cc_fetcher import prefix_cc_lookup_base_uri
+from eds4jinja2.adapters.prefix_cc_fetcher import prefix_cc_lookup_base_uri, prefix_cc_lookup_prefix
 
 logger = logging.getLogger(__name__)
 
@@ -45,47 +44,14 @@ def namespaces_as_dict(self):
         """
         return {prefix: ns_uri.toPython() for prefix, ns_uri in self.namespaces()}
 
-    def simplify_uris_in_tabular(self, data_frame: DataFrame, target_columns: List = None,
-                                 prefix_cc_lookup=True, inplace=True, error_fail=True) -> Dict:
-        """
-            Replace the full URIs by their qname counterparts. Discover the namespaces
-            in the process, if the namespaces are not defined.
-
-        :param error_fail: fail on error or throw exception per data_fame cell
-        :param inplace: indicate whether the current data_frame shall be modified or a new one be created instead
-        :param prefix_cc_lookup:
-        :param target_columns: the target columns to explore;
-                                    Expectation is that these columns exclusively contain only URIs as values
-        :param data_frame: the dataframe to explore
-        :return:  dictionary with newly discovered namespace definitions
-        """
-        if not target_columns:
-            target_columns = []
-
-        for col in target_columns:
-            if col not in data_frame.columns.values.tolist():
-                raise ValueError("The target column not found in the data frame")
-        # get all the string columns
-        obj_columns = data_frame.select_dtypes([np.object]).columns  # [1:]
-        # limit to columns indicated in the self.target_columns
-        obj_columns = filter(lambda x: x in target_columns, obj_columns) if target_columns else obj_columns
-
-        # copy the dataframe if needed
-        result_frame = data_frame if inplace else data_frame.copy(deep=True)
-        for column in obj_columns:
-            #
-            result_frame[column] = result_frame[column].apply(
-                lambda x: self.uri_to_qname(x, prefix_cc_lookup=prefix_cc_lookup, error_fail=error_fail))
-        return result_frame
-
-    def uri_to_qname(self, uri_string, prefix_cc_lookup=True, error_fail=True):
+    def uri_to_qname(self, uri_string, prefix_cc_lookup=True, error_fail=False):
         """
             Transform the uri_string to a qname string and remember the namespace.
             If the namespace is not defined, the prefix can be looked up on prefix.cc
         :param error_fail: whether the errors shall fail hard or just issue a warning
         :param prefix_cc_lookup: whether to lookup a namespace on prefix.cc in case it is unknown or not.
         :param uri_string: the string of a URI to be reduced to a QName
-        :return:
+        :return: qname string
         """
         try:
             computed_ns = self.compute_qname_strict(uri_string)
@@ -104,3 +70,71 @@ def uri_to_qname(self, uri_string, prefix_cc_lookup=True, error_fail=True):
                 raise e
 
             return uri_string
+
+    def qname_to_uri(self, qname_string: str, prefix_cc_lookup=True, error_fail=False) -> str:
+        """
+            Transform the QName into an URI
+        :param qname_string: the qname string to be expanded to URI
+        :param error_fail: whether errors shall fail hard (the exception is re-raised) or just
+                           issue a warning and return the QName string unchanged
+        :param prefix_cc_lookup: whether to look up missing prefixes at http://prefix.cc
+        :return: the absolute URI string
+        """
+        try:
+            if not re.search(r"^[\w\d.\-_]+:[\w\d.\-_]+$", qname_string):
+                raise ValueError('Not a QName for the form prefix:localname string: ' + qname_string)
+            s = qname_string.split(':')
+            prefix, local_name = s[0], s[1]
+
+            if prefix not in self.namespaces_as_dict():
+                if prefix_cc_lookup:
+                    lookup_result = prefix_cc_lookup_prefix(prefix=prefix)
+                    if lookup_result:
+                        for prefix, namespace in lookup_result.items():  # expecting at most one result
+                            self.bind(prefix=prefix, namespace=namespace, override=True, replace=True)
+                        self.reset()
+                    else:
+                        raise ValueError('Unknown prefix: ' + prefix)
+                else:
+                    raise ValueError('Unknown prefix: ' + prefix)
+            return self.namespaces_as_dict()[prefix] + local_name
+        except Exception as e:
+            logger.warning(f"Could not transform the QName <{qname_string}> to its absolute URI form.")
+            if error_fail:
+                raise e
+            return qname_string
+
+
+def simplify_uris_in_tabular(data_frame: DataFrame, namespace_inventory: NamespaceInventory,
+                             target_columns: List = None,
+                             prefix_cc_lookup=True, inplace=True, error_fail=True) -> DataFrame:
+    """
+        Replace the full URIs by their qname counterparts. Discover the namespaces
+        in the process, if the namespaces are not defined.
+
+    :param namespace_inventory: the namespace inventory to be used for replacement resolution
+    :param error_fail: whether errors shall fail hard or just issue a warning, per data_frame cell
+    :param inplace: indicate whether the current data_frame shall be modified or a new one be created instead
+    :param prefix_cc_lookup:
+    :param target_columns: the target columns to explore;
+                                Expectation is that these columns exclusively contain only URIs as values
+    :param data_frame: the dataframe to explore
+    :return:  the DataFrame with replaced values
+    """
+    if not target_columns:
+        target_columns = []
+
+    for col in target_columns:
+        if col not in data_frame.columns.values.tolist():
+            raise ValueError("The target column not found in the data frame")
+    # get all the string columns
+    obj_columns = data_frame.select_dtypes([object]).columns
+    # limit to the columns indicated in target_columns
+    obj_columns = filter(lambda x: x in target_columns, obj_columns) if target_columns else obj_columns
+
+    # copy the dataframe if needed
+    result_frame = data_frame if inplace else data_frame.copy(deep=True)
+    for column in obj_columns:
+        result_frame[column] = result_frame[column].apply(
+            lambda x: namespace_inventory.uri_to_qname(x, prefix_cc_lookup=prefix_cc_lookup, error_fail=error_fail))
+    return result_frame
diff --git a/eds4jinja2/adapters/prefix_cc_fetcher.py b/eds4jinja2/adapters/prefix_cc_fetcher.py
@@ -7,6 +7,7 @@
 
 """ """
 import json
+from typing import Dict
 
 import requests
 
@@ -17,23 +18,18 @@
 PREFIX_CC_REVERSE_LOOKUP_URL = "http://prefix.cc/reverse"
 
 
-def prefix_cc_lookup_prefix(prefix: str) -> str:
+def prefix_cc_lookup_prefix(prefix: str) -> Dict:
     """
         Lookup a prefix at prefix.cc API and return the base namespace.
-    :param prefix:
-    :return: the namespace definition
     """
     response = requests.get(url=PREFIX_CC_LOOKUP_URL + f"{prefix}.file.json")
-    # response.raise_for_status()
     return json.loads(response.content) if response.content else None
 
 
-def prefix_cc_lookup_base_uri(base_uri: str) -> str:
+def prefix_cc_lookup_base_uri(base_uri: str) -> Dict:
     """
         Lookup a base namespace on prefix.cc API and return the first prefix (shortest and first in an ordered list).
         If the base_uri is not in the namespace definitions then return None.
-    :param base_uri:
-    :return: the prefix
     """
     payload = {"uri": base_uri, "format": "json"}
     response = requests.get(url=PREFIX_CC_REVERSE_LOOKUP_URL, params=payload)
@@ -42,10 +38,9 @@ def prefix_cc_lookup_base_uri(base_uri: str) -> str:
         return namespaces if base_uri in invert_dict(namespaces) else None
 
 
-def prefix_cc_all() -> dict:
+def prefix_cc_all() -> Dict:
     """
         Return all definitions from the prefix.cc
-    :return:
     """
     response = requests.get(url=PREFIX_CC_ALL_JSON)
     return json.loads(response.content) if response.ok else None
diff --git a/eds4jinja2/adapters/tabular_utils.py b/eds4jinja2/adapters/tabular_utils.py
@@ -43,7 +43,7 @@ def replace_strings_in_tabular(data_frame: pd.DataFrame, target_columns: List[st
         if col not in data_frame.columns.values.tolist():
             raise ValueError("The target column not found in the data frame")
     # get all the string columns
-    obj_columns = data_frame.select_dtypes([np.object]).columns  # [1:]
+    obj_columns = data_frame.select_dtypes([object]).columns  # [1:]
     # columns = self.target_columns if self.target_columns else self.data_frame.columns
     # limit to columns indicated in the self.target_columns
     if target_columns:

diff --git a/eds4jinja2/builders/jinja_builder.py b/eds4jinja2/builders/jinja_builder.py
@@ -11,7 +11,7 @@
 from eds4jinja2.adapters.file_ds import FileDataSource
 from eds4jinja2.adapters.latex_utils import escape_latex
 from eds4jinja2.adapters.local_sparql_ds import RDFFileDataSource
-from eds4jinja2.adapters.namespace_handler import NamespaceInventory
+from eds4jinja2.adapters.namespace_handler import NamespaceInventory, simplify_uris_in_tabular
 from eds4jinja2.adapters.remote_sparql_ds import RemoteSPARQLEndpointDataSource
 from eds4jinja2.adapters.tabular_utils import replace_strings_in_tabular, add_relative_figures
 
@@ -23,16 +23,26 @@
 
 TABULAR_HELPERS = {
     "invert_dict": lambda mapping_dict, reduce_values=True: invert_dict(mapping_dict, reduce_values),
-    "replace_strings_in_tabular": lambda data_frame, target_columns, value_mapping_dict,
-                                         mark_touched_rows=False: replace_strings_in_tabular(data_frame,
-                                                                                             target_columns,
-                                                                                             value_mapping_dict,
-                                                                                             mark_touched_rows),
-    "add_relative_figures": lambda data_frame, target_columns, relativisers,
-                                   percentage=True: add_relative_figures(data_frame, target_columns, relativisers,
-                                                                         percentage),
+    "replace_strings_in_tabular": lambda data_frame, target_columns,
+                                         value_mapping_dict, mark_touched_rows=False: replace_strings_in_tabular(
+        data_frame,
+        target_columns,
+        value_mapping_dict,
+        mark_touched_rows),
+    "add_relative_figures": lambda data_frame, target_columns, relativisers, percentage=True: add_relative_figures(
+        data_frame,
+        target_columns,
+        relativisers, percentage),
     "namespace_inventory": lambda namespace_definition_dict: NamespaceInventory(
         namespace_definition_dict=namespace_definition_dict),
+    "simplify_uri_columns_in_tabular": lambda data_frame, namespace_inventory, target_columns=None, prefix_cc_lookup=True,
+                                       inplace=True, error_fail=True: simplify_uris_in_tabular(
+        data_frame,
+        namespace_inventory,
+        target_columns,
+        prefix_cc_lookup,
+        inplace,
+        error_fail)
 }
 
 TREE_HELPERS = {}

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -285,7 +285,7 @@
 {% set content, error = from_file(path).fetch_tabular() %} \n
 {% set ni = namespace_inventory({}) %}
 
-content:  {{ ni.simplify_uris_in_tabular(content, error_fail=False) }}\n
+content:  {{ simplify_uri_columns_in_tabular(content, ni, error_fail=False) }}\n
 error: {{ error }}\n
 namespace definitions: {{ ni.namespaces_as_dict() | tojson }}
 '''

diff --git a/tests/unit/test_namespace_handlers.py b/tests/unit/test_namespace_handlers.py
@@ -6,10 +6,11 @@
 # Email: costezki.eugen@gmail.com 
 
 """ """
-from pprint import pprint
+
+import pytest
 
 from eds4jinja2.adapters import first_key, first_key_value, invert_dict
-from eds4jinja2.adapters.namespace_handler import NamespaceInventory
+from eds4jinja2.adapters.namespace_handler import NamespaceInventory, simplify_uris_in_tabular
 from eds4jinja2.adapters.prefix_cc_fetcher import prefix_cc_lookup_prefix, prefix_cc_lookup_base_uri, prefix_cc_all
 
 
@@ -72,7 +73,7 @@ def test_uri_to_qname(dummy_prefixes):
 
 def test_simplify_uri_to_qname_open(dummy_df):
     ni = NamespaceInventory()
-    ni.simplify_uris_in_tabular(dummy_df, target_columns=["s", "p", "o"])
+    simplify_uris_in_tabular(dummy_df, namespace_inventory=ni, target_columns=["s", "p", "o"])
 
     ns_inv = ni.namespaces_as_dict()
     assert "ns1" in ns_inv
@@ -87,7 +88,7 @@ def test_simplify_uri_to_qname_open(dummy_df):
 
 def test_simplify_uri_to_qname_close(dummy_df):
     ni = NamespaceInventory()
-    ni.simplify_uris_in_tabular(dummy_df, prefix_cc_lookup=False, target_columns=["s", "p", "o"])
+    simplify_uris_in_tabular(dummy_df, namespace_inventory=ni, prefix_cc_lookup=False, target_columns=["s", "p", "o"])
 
     ns_inv = ni.namespaces_as_dict()
     assert "ns1" in ns_inv
@@ -97,11 +98,22 @@ def test_simplify_uri_to_qname_close(dummy_df):
 
 
 def test_new_namespace_inventory(dummy_prefixes):
-    nm = NamespaceInventory(dummy_prefixes)
-    assert nm.uri_to_qname(
+    ni = NamespaceInventory(dummy_prefixes)
+    assert ni.uri_to_qname(
         "http://publications.europa.eu/resource/authority/corporate-body/COB1") == "corporate-body:COB1"
-    assert nm.uri_to_qname(
+    assert ni.uri_to_qname(
         "http://publications.e67u/resource/authority/corporate-body/cmdfg34") == "ns1:cmdfg34"
-    assert nm.uri_to_qname("http://www.w3.org/2004/02/skos/core#Concept") == "skos:Concept"
+    assert ni.uri_to_qname("http://www.w3.org/2004/02/skos/core#Concept") == "skos:Concept"
 
 
+def test_qname_to_uri(dummy_prefixes):
+    ni = NamespaceInventory(dummy_prefixes)
+    assert ni.qname_to_uri(qname_string="dct:date", error_fail=False) == "http://purl.org/dc/terms/date"
+    assert ni.qname_to_uri(qname_string="rdf:type",
+                           error_fail=False) == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
+    assert ni.qname_to_uri(qname_string="http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
+                           error_fail=False) == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
+    assert ni.qname_to_uri(qname_string="bowlik", error_fail=False) == "bowlik"
+
+    with pytest.raises(ValueError):
+        assert ni.qname_to_uri(qname_string="bowlik", error_fail=True) == "bowlik"