Skip to content

Commit

Permalink
Merge pull request #40 from meaningfy-ws/feature/lam-181
Browse files Browse the repository at this point in the history
Feature/lam 181
  • Loading branch information
mclaurentiu authored Feb 27, 2021
2 parents 0a27f74 + e1f78a7 commit d32f44c
Show file tree
Hide file tree
Showing 8 changed files with 126 additions and 79 deletions.
4 changes: 2 additions & 2 deletions eds4jinja2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
__docformat__ = "restructuredtext en"

# The format of the __version__ line is matched by a regex in setup.py and /docs/conf.py
__version__ = "0.1.32"
__date__ = "2021-02-25"
__version__ = "0.1.33"
__date__ = "2021-02-26"

import logging

Expand Down
16 changes: 6 additions & 10 deletions eds4jinja2/adapters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,19 @@ def sort_by_size_and_alphabet(l: List) -> List:
return sorted(l, key=lambda x: (len(x), x))


def first_key(d: Dict) -> object:
def first_key(d: (Dict, None)) -> object:
"""
Return the first dict key from all the keys, ordered first by their length and then alphabetically.
:param d:
:return:
"""
return sort_by_size_and_alphabet(d.keys())[0] if d else None
return sort_by_size_and_alphabet(list(d.keys()))[0] if d else None


def first_key_value(d: Dict) -> object:
def first_key_value(d: (Dict, None)) -> object:
"""
Return the dict value for the first key in the dict;
The first key is determined using `first_key` function.
:param d:
:return:
"""
return d[first_key(d)]
return d[first_key(d)] if d else None


def invert_dict(mapping_dict: Dict, reduce_values: bool = True):
Expand All @@ -44,7 +40,7 @@ def invert_dict(mapping_dict: Dict, reduce_values: bool = True):
The list can be reduced to single item by setting reduce_values=True.
>>> d = {"a":1, "b":2, c:1}
>>> d = {"a":1, "b":2, "c":1}
>>> reduced_d = invert_dict(d)
{1: 'a', 2: 'b'}
Expand Down Expand Up @@ -77,4 +73,4 @@ def deep_update(source, overrides):
source[key] = returned
else:
source[key] = overrides[key]
return source
return source
112 changes: 73 additions & 39 deletions eds4jinja2/adapters/namespace_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,14 @@
"""

import logging
from pprint import pprint
from typing import Dict, List
import re
from typing import List

import numpy as np
import rdflib
from pandas import DataFrame

from eds4jinja2.adapters import invert_dict
from eds4jinja2.adapters.prefix_cc_fetcher import prefix_cc_lookup_base_uri
from eds4jinja2.adapters.prefix_cc_fetcher import prefix_cc_lookup_base_uri, prefix_cc_lookup_prefix

logger = logging.getLogger(__name__)

Expand All @@ -45,47 +44,14 @@ def namespaces_as_dict(self):
"""
return {prefix: ns_uri.toPython() for prefix, ns_uri in self.namespaces()}

def simplify_uris_in_tabular(self, data_frame: DataFrame, target_columns: List = None,
                             prefix_cc_lookup=True, inplace=True, error_fail=True) -> DataFrame:
    """
    Replace the full URIs by their qname counterparts. Discover the namespaces
    in the process, if the namespaces are not defined.

    Only columns of ``object`` dtype are processed; every cell of such a column
    is passed through ``self.uri_to_qname``.

    :param data_frame: the dataframe to explore
    :param target_columns: the target columns to explore; when empty or None, all
                            object-dtype columns are processed.
                            Expectation is that these columns exclusively contain only URIs as values
    :param prefix_cc_lookup: forwarded to ``uri_to_qname``
    :param inplace: indicate whether the current data_frame shall be modified or a new one be created instead
    :param error_fail: forwarded to ``uri_to_qname`` for every data_frame cell
    :return: the DataFrame with replaced values (the very same object as data_frame when inplace is True)
    :raise ValueError: when one of the target columns is not present in the data frame
    """
    target_columns = target_columns or []

    # compute the column list once instead of once per target column
    existing_columns = data_frame.columns.values.tolist()
    for col in target_columns:
        if col not in existing_columns:
            raise ValueError("The target column not found in the data frame")

    # get all the string columns; the builtin `object` replaces the `np.object` alias,
    # which was deprecated in NumPy 1.20 and removed in NumPy 1.24
    obj_columns = data_frame.select_dtypes([object]).columns
    # limit to the columns indicated in target_columns, if any were given
    if target_columns:
        obj_columns = [column for column in obj_columns if column in target_columns]

    # copy the dataframe if needed
    result_frame = data_frame if inplace else data_frame.copy(deep=True)
    for column in obj_columns:
        result_frame[column] = result_frame[column].apply(
            lambda x: self.uri_to_qname(x, prefix_cc_lookup=prefix_cc_lookup, error_fail=error_fail))
    return result_frame

def uri_to_qname(self, uri_string, prefix_cc_lookup=True, error_fail=True):
def uri_to_qname(self, uri_string, prefix_cc_lookup=True, error_fail=False):
"""
Transform the uri_string to a qname string and remember the namespace.
If the namespace is not defined, the prefix can be looked up on prefix.cc
:param error_fail: whether the errors shall fail hard or just issue a warning
:param prefix_cc_lookup: whether to lookup a namespace on prefix.cc in case it is unknown or not.
:param uri_string: the string of a URI to be reduced to a QName
:return:
:return: qname string
"""
try:
computed_ns = self.compute_qname_strict(uri_string)
Expand All @@ -104,3 +70,71 @@ def uri_to_qname(self, uri_string, prefix_cc_lookup=True, error_fail=True):
raise e

return uri_string

def qname_to_uri(self, qname_string: str, prefix_cc_lookup=True, error_fail=False) -> str:
    """
    Transform the QName into an absolute URI.
    If the prefix is not defined in this inventory, it can be looked up on prefix.cc;
    a successfully looked up prefix is bound to this inventory.

    :param qname_string: the qname string (of the form prefix:localname) to be expanded to URI
    :param prefix_cc_lookup: whether to look for missing prefixes at http://prefix.cc
    :param error_fail: whether the errors shall fail hard or just issue a warning
    :return: the absolute URI string; when the expansion fails and error_fail is False,
             the qname_string is returned unchanged
    :raise ValueError: (only when error_fail is True) if qname_string is not of the form
                       prefix:localname or if the prefix is unknown
    """
    try:
        # fullmatch instead of search with ^...$: `$` also matches just before a trailing
        # newline, which would let "prefix:name\n" through and leak the newline into the URI
        if not re.fullmatch(r"[\w.\-]+:[\w.\-]+", qname_string):
            raise ValueError('Not a QName for the form prefix:localname string: ' + qname_string)
        # the pattern guarantees exactly one colon
        prefix, _, local_name = qname_string.partition(':')

        if prefix not in self.namespaces_as_dict():
            lookup_result = prefix_cc_lookup_prefix(prefix=prefix) if prefix_cc_lookup else None
            if not lookup_result:
                raise ValueError('Unknown prefix: ' + prefix)
            for found_prefix, found_namespace in lookup_result.items():  # expecting at most one result
                self.bind(prefix=found_prefix, namespace=found_namespace, override=True, replace=True)
                # resolve with the prefix that was actually registered
                prefix = found_prefix
            self.reset()
        return self.namespaces_as_dict()[prefix] + local_name
    except Exception:
        # deliberate best effort: any failure is reported and, unless error_fail is set,
        # the input is handed back untouched
        logger.warning(f"Could not transform the QName <{qname_string}> to its absolute URI form.")
        if error_fail:
            raise
        return qname_string


def simplify_uris_in_tabular(data_frame: DataFrame, namespace_inventory: NamespaceInventory,
                             target_columns: List = None,
                             prefix_cc_lookup=True, inplace=True, error_fail=True) -> DataFrame:
    """
    Shorten the absolute URIs found in a data frame to their qname form, using (and
    possibly extending) the provided namespace inventory.

    :param data_frame: the dataframe whose cells are rewritten
    :param namespace_inventory: the namespace inventory whose ``uri_to_qname`` performs the replacement
    :param target_columns: the columns to process; when empty or None every object-dtype column is processed.
                            Expectation is that these columns exclusively contain only URIs as values
    :param prefix_cc_lookup: passed through to ``uri_to_qname``
    :param inplace: when True the given data_frame is modified, otherwise a deep copy is modified
    :param error_fail: passed through to ``uri_to_qname`` for each cell
    :return: the DataFrame with replaced values
    :raise ValueError: when a requested target column does not exist in the data frame
    """
    requested = target_columns if target_columns else []

    # refuse to continue when any requested column is absent
    if any(name not in data_frame.columns.values.tolist() for name in requested):
        raise ValueError("The target column not found in the data frame")

    # candidate columns are the object-dtype ones, narrowed to the requested ones if any
    candidates = data_frame.select_dtypes([object]).columns
    if requested:
        candidates = [name for name in candidates if name in requested]

    def shorten(value):
        return namespace_inventory.uri_to_qname(value, prefix_cc_lookup=prefix_cc_lookup,
                                                error_fail=error_fail)

    # work on a deep copy unless in-place modification was asked for
    frame = data_frame.copy(deep=True) if not inplace else data_frame
    for name in candidates:
        frame[name] = frame[name].apply(shorten)
    return frame
13 changes: 4 additions & 9 deletions eds4jinja2/adapters/prefix_cc_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

""" """
import json
from typing import Dict

import requests

Expand All @@ -17,23 +18,18 @@
PREFIX_CC_REVERSE_LOOKUP_URL = "http://prefix.cc/reverse"


def prefix_cc_lookup_prefix(prefix: str) -> str:
def prefix_cc_lookup_prefix(prefix: str) -> Dict:
"""
Lookup a prefix at prefix.cc API and return the base namespace.
:param prefix:
:return: the namespace definition
"""
response = requests.get(url=PREFIX_CC_LOOKUP_URL + f"{prefix}.file.json")
# response.raise_for_status()
return json.loads(response.content) if response.content else None


def prefix_cc_lookup_base_uri(base_uri: str) -> str:
def prefix_cc_lookup_base_uri(base_uri: str) -> Dict:
"""
Lookup a base namespace on prefix.cc API and return the first prefix (shortest and first in an ordered list).
If the base_uri is not in the namespace definitions then return None.
:param base_uri:
:return: the prefix
"""
payload = {"uri": base_uri, "format": "json"}
response = requests.get(url=PREFIX_CC_REVERSE_LOOKUP_URL, params=payload)
Expand All @@ -42,10 +38,9 @@ def prefix_cc_lookup_base_uri(base_uri: str) -> str:
return namespaces if base_uri in invert_dict(namespaces) else None


def prefix_cc_all() -> dict:
def prefix_cc_all() -> Dict:
"""
Return all definitions from the prefix.cc
:return:
"""
response = requests.get(url=PREFIX_CC_ALL_JSON)
return json.loads(response.content) if response.ok else None
2 changes: 1 addition & 1 deletion eds4jinja2/adapters/tabular_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def replace_strings_in_tabular(data_frame: pd.DataFrame, target_columns: List[st
if col not in data_frame.columns.values.tolist():
raise ValueError("The target column not found in the data frame")
# get all the string columns
obj_columns = data_frame.select_dtypes([np.object]).columns # [1:]
obj_columns = data_frame.select_dtypes([object]).columns # [1:]
# columns = self.target_columns if self.target_columns else self.data_frame.columns
# limit to columns indicated in the self.target_columns
if target_columns:
Expand Down
28 changes: 19 additions & 9 deletions eds4jinja2/builders/jinja_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from eds4jinja2.adapters.file_ds import FileDataSource
from eds4jinja2.adapters.latex_utils import escape_latex
from eds4jinja2.adapters.local_sparql_ds import RDFFileDataSource
from eds4jinja2.adapters.namespace_handler import NamespaceInventory
from eds4jinja2.adapters.namespace_handler import NamespaceInventory, simplify_uris_in_tabular
from eds4jinja2.adapters.remote_sparql_ds import RemoteSPARQLEndpointDataSource
from eds4jinja2.adapters.tabular_utils import replace_strings_in_tabular, add_relative_figures

Expand All @@ -23,16 +23,26 @@

TABULAR_HELPERS = {
"invert_dict": lambda mapping_dict, reduce_values=True: invert_dict(mapping_dict, reduce_values),
"replace_strings_in_tabular": lambda data_frame, target_columns, value_mapping_dict,
mark_touched_rows=False: replace_strings_in_tabular(data_frame,
target_columns,
value_mapping_dict,
mark_touched_rows),
"add_relative_figures": lambda data_frame, target_columns, relativisers,
percentage=True: add_relative_figures(data_frame, target_columns, relativisers,
percentage),
"replace_strings_in_tabular": lambda data_frame, target_columns,
value_mapping_dict, mark_touched_rows=False: replace_strings_in_tabular(
data_frame,
target_columns,
value_mapping_dict,
mark_touched_rows),
"add_relative_figures": lambda data_frame, target_columns, relativisers, percentage=True: add_relative_figures(
data_frame,
target_columns,
relativisers, percentage),
"namespace_inventory": lambda namespace_definition_dict: NamespaceInventory(
namespace_definition_dict=namespace_definition_dict),
"simplify_uri_columns_in_tabular": lambda data_frame, namespace_inventory, target_columns=None, prefix_cc_lookup=True,
inplace=True, error_fail=True: simplify_uris_in_tabular(
data_frame,
namespace_inventory,
target_columns,
prefix_cc_lookup,
inplace,
error_fail)
}

TREE_HELPERS = {}
Expand Down
2 changes: 1 addition & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@
{% set content, error = from_file(path).fetch_tabular() %} \n
{% set ni = namespace_inventory({}) %}
content: {{ ni.simplify_uris_in_tabular(content, error_fail=False) }}\n
content: {{ simplify_uri_columns_in_tabular(content, ni, error_fail=False) }}\n
error: {{ error }}\n
namespace definitions: {{ ni.namespaces_as_dict() | tojson }}
'''
Expand Down
28 changes: 20 additions & 8 deletions tests/unit/test_namespace_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
# Email: costezki.eugen@gmail.com

""" """
from pprint import pprint

import pytest

from eds4jinja2.adapters import first_key, first_key_value, invert_dict
from eds4jinja2.adapters.namespace_handler import NamespaceInventory
from eds4jinja2.adapters.namespace_handler import NamespaceInventory, simplify_uris_in_tabular
from eds4jinja2.adapters.prefix_cc_fetcher import prefix_cc_lookup_prefix, prefix_cc_lookup_base_uri, prefix_cc_all


Expand Down Expand Up @@ -72,7 +73,7 @@ def test_uri_to_qname(dummy_prefixes):

def test_simplify_uri_to_qname_open(dummy_df):
ni = NamespaceInventory()
ni.simplify_uris_in_tabular(dummy_df, target_columns=["s", "p", "o"])
simplify_uris_in_tabular(dummy_df, namespace_inventory=ni, target_columns=["s", "p", "o"])

ns_inv = ni.namespaces_as_dict()
assert "ns1" in ns_inv
Expand All @@ -87,7 +88,7 @@ def test_simplify_uri_to_qname_open(dummy_df):

def test_simplify_uri_to_qname_close(dummy_df):
ni = NamespaceInventory()
ni.simplify_uris_in_tabular(dummy_df, prefix_cc_lookup=False, target_columns=["s", "p", "o"])
simplify_uris_in_tabular(dummy_df, namespace_inventory=ni, prefix_cc_lookup=False, target_columns=["s", "p", "o"])

ns_inv = ni.namespaces_as_dict()
assert "ns1" in ns_inv
Expand All @@ -97,11 +98,22 @@ def test_simplify_uri_to_qname_close(dummy_df):


def test_new_namespace_inventory(dummy_prefixes):
nm = NamespaceInventory(dummy_prefixes)
assert nm.uri_to_qname(
ni = NamespaceInventory(dummy_prefixes)
assert ni.uri_to_qname(
"http://publications.europa.eu/resource/authority/corporate-body/COB1") == "corporate-body:COB1"
assert nm.uri_to_qname(
assert ni.uri_to_qname(
"http://publications.e67u/resource/authority/corporate-body/cmdfg34") == "ns1:cmdfg34"
assert nm.uri_to_qname("http://www.w3.org/2004/02/skos/core#Concept") == "skos:Concept"
assert ni.uri_to_qname("http://www.w3.org/2004/02/skos/core#Concept") == "skos:Concept"


def test_qname_to_uri(dummy_prefixes):
    """
    Check that known QNames are expanded to absolute URIs, that strings which are not
    QNames are returned unchanged when error_fail is False, and that they raise a
    ValueError when error_fail is True.
    """
    ni = NamespaceInventory(dummy_prefixes)
    assert ni.qname_to_uri(qname_string="dct:date", error_fail=False) == "http://purl.org/dc/terms/date"
    assert ni.qname_to_uri(qname_string="rdf:type",
                           error_fail=False) == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    # inputs that are not of the form prefix:localname come back untouched
    assert ni.qname_to_uri(qname_string="http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
                           error_fail=False) == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    assert ni.qname_to_uri(qname_string="bowlik", error_fail=False) == "bowlik"

    # the call itself must raise; an assert on its return value could never be evaluated
    with pytest.raises(ValueError):
        ni.qname_to_uri(qname_string="bowlik", error_fail=True)

0 comments on commit d32f44c

Please sign in to comment.