Skip to content

Commit

Permalink
Merge pull request #39 from meaningfy-ws/feature/lam-126
Browse files Browse the repository at this point in the history
A set of multiple improvements and implementations
  • Loading branch information
costezki authored Feb 26, 2021
2 parents dae9992 + 7e1c835 commit 0a27f74
Show file tree
Hide file tree
Showing 81 changed files with 3,199 additions and 12,364 deletions.
4 changes: 0 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@ install:
@ pip install -r requirements.txt
@ pip install -r requirements-dev.txt

lint:
@ echo "$(BUILD_PRINT)Linting the code"
@ flake8 || true

test:
@ echo "$(BUILD_PRINT)Running the tests"
@ pytest
Expand Down
11 changes: 6 additions & 5 deletions eds4jinja2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,26 @@

# The format of the __version__ line is matched by a regex in setup.py and /docs/conf.py
__version__ = "0.1.32"
__date__ = "2021-02-23"
__date__ = "2021-02-25"

import logging

from eds4jinja2.adapters.local_sparql_ds import RDFFileDataSource
from eds4jinja2.builders.jinja_builder import build_eds_environment, inject_environment_globals
from eds4jinja2.adapters.file_ds import FileDataSource
from eds4jinja2.adapters.local_sparql_ds import RDFFileDataSource
from eds4jinja2.adapters.namespace_handler import NamespaceInventory
from eds4jinja2.adapters.remote_sparql_ds import RemoteSPARQLEndpointDataSource
from eds4jinja2.adapters.tabular_utils import invert_dict, add_relative_figures, replace_strings_in_tabular
from eds4jinja2.adapters.tabular_utils import add_relative_figures, replace_strings_in_tabular
from eds4jinja2.builders.jinja_builder import build_eds_environment, inject_environment_globals

# Making usage of this library more convenient
__all__ = ["build_eds_environment",
"inject_environment_globals",
"FileDataSource",
"RemoteSPARQLEndpointDataSource",
"RDFFileDataSource",
"invert_dict",
"add_relative_figures",
"replace_strings_in_tabular",
"NamespaceInventory"
]

# hard coding the log level and format
Expand Down
74 changes: 74 additions & 0 deletions eds4jinja2/adapters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,77 @@
# Date: 07/08/2020
# Author: Eugeniu Costetchi
# Email: costezki.eugen@gmail.com
import collections
from typing import List, Dict


def sort_by_size_and_alphabet(l: List) -> List:
    """
    Order the items of an iterable by their length first and by their natural ordering second.

    :param l: an iterable of sized, mutually comparable items (e.g. strings)
    :return: a new list holding the ordered items
    """
    def size_then_value(item):
        return len(item), item

    ordered = list(l)
    ordered.sort(key=size_then_value)
    return ordered


def first_key(d: Dict) -> object:
    """
    Pick the key that comes first once all the keys are ordered by length and then alphabetically.

    :param d: the dictionary whose keys are inspected
    :return: the first key in that ordering, or None when the dictionary is empty
    """
    if not d:
        return None
    ordered_keys = sort_by_size_and_alphabet(d.keys())
    return ordered_keys[0]


def first_key_value(d: Dict) -> object:
    """
    Look up the value stored under the first key of the dictionary.

    Which key counts as the first one is decided by the `first_key` function.

    :param d: the dictionary to read from
    :return: the value associated with the first key
    :raises KeyError: when the dictionary is empty, because `first_key` yields None in that case
    """
    selected_key = first_key(d)
    return d[selected_key]


def invert_dict(mapping_dict: Dict, reduce_values: bool = True):
    """
    Invert the dictionary by swapping keys and values.

    In case the values are unique the inverted dict is of the same size as the initial one.
    Otherwise it shrinks to the unique values, and the keys sharing a value are cumulated in a list.
    With reduce_values=True (the default) each such list is reduced to a single key: the shortest one,
    ties being resolved alphabetically.

    >>> invert_dict({"a": 1, "b": 2, "c": 1})
    {1: 'a', 2: 'b'}
    >>> invert_dict({"a": 1, "b": 2, "c": 1}, False)
    {1: ['a', 'c'], 2: ['b']}

    :param mapping_dict: the dictionary to be inverted; its values become keys and must be hashable
    :param reduce_values: if True the values of the result are single items, otherwise
                          they are lists of possibly multiple items
    :return: the inverted dictionary
    """
    inv_map = {}
    for key, value in mapping_dict.items():
        # group the original keys under the value they map to
        inv_map.setdefault(value, []).append(key)
    if reduce_values:
        return {value: sort_by_size_and_alphabet(keys)[0] for value, keys in inv_map.items()}
    return inv_map


def deep_update(source, overrides):
    """
    Update a nested dictionary or similar mapping.
    Modify ``source`` in place.
    Used from https://stackoverflow.com/questions/3232943/update-value-of-a-nested-dictionary-of-varying-depth

    Non-empty mappings found in ``overrides`` are merged recursively into the corresponding
    entry of ``source``; every other value (including an empty mapping) replaces the entry.

    :param source: the mapping to be updated
    :param overrides: the mapping holding the new values
    :return: ``source``, after the update
    """
    # ``collections.Mapping`` was removed in Python 3.10; the ABC lives in ``collections.abc``.
    from collections.abc import Mapping

    for key, value in overrides.items():
        if isinstance(value, Mapping) and value:
            existing = source.get(key)
            if not isinstance(existing, Mapping):
                # Nothing to merge into (missing key or a non-mapping value): the override wins.
                existing = {}
            source[key] = deep_update(existing, value)
        else:
            source[key] = value
    return source
106 changes: 106 additions & 0 deletions eds4jinja2/adapters/namespace_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/usr/bin/python3

# namespace_handler.py
# Date: 25/02/2021
# Author: Eugeniu Costetchi
# Email: costezki.eugen@gmail.com

"""
This module deals with namespace management over Pandas DataFrames.
- Discovery of prefixes and namespaces in a DataFrame
- prefix.cc lookup
- Maintenance of a namespace inventory
- Shortening of URIs to their QName forms
"""

import logging
from pprint import pprint
from typing import Dict, List

import numpy as np
import rdflib
from pandas import DataFrame

from eds4jinja2.adapters import invert_dict
from eds4jinja2.adapters.prefix_cc_fetcher import prefix_cc_lookup_base_uri

logger = logging.getLogger(__name__)


class NamespaceInventory(rdflib.namespace.NamespaceManager):
    """
    A namespace manager that maintains an inventory of prefix/namespace definitions and
    shortens URIs (single values or whole DataFrame columns) to their QName forms.
    """

    def __init__(self, namespace_definition_dict=None):
        """
        :param namespace_definition_dict: optional dictionary of prefix -> namespace URI
                                          definitions to bind at creation time
        """
        super().__init__(rdflib.Graph())

        if namespace_definition_dict:
            # Reduce the namespace definition dictionary and bind the definitions:
            # inverting twice keeps a single prefix per namespace (the shortest one,
            # alphabetically first on ties).
            for prefix, namespace in invert_dict(invert_dict(namespace_definition_dict)).items():
                self.bind(prefix=prefix, namespace=namespace, replace=True, override=True)

        # base URIs already sent to prefix.cc by this instance, so each one is looked up only once
        self._remote_query_cash = []

    def namespaces_as_dict(self):
        """
        :return: return the namespace definitions as a dict (prefix -> namespace URI)
        """
        return {prefix: ns_uri.toPython() for prefix, ns_uri in self.namespaces()}

    def simplify_uris_in_tabular(self, data_frame: DataFrame, target_columns: List = None,
                                 prefix_cc_lookup=True, inplace=True, error_fail=True) -> DataFrame:
        """
        Replace the full URIs by their qname counterparts. Discover the namespaces
        in the process, if the namespaces are not defined.

        :param data_frame: the dataframe to explore
        :param target_columns: the target columns to explore; when empty, all object-typed columns
                               are explored. Expectation is that these columns exclusively
                               contain only URIs as values
        :param prefix_cc_lookup: whether unknown namespaces shall be looked up on prefix.cc
        :param inplace: indicate whether the current data_frame shall be modified or a new one
                        be created instead
        :param error_fail: whether a cell that cannot be transformed raises the error (True)
                           or is left unchanged with a warning (False)
        :return: the data frame with the URIs replaced; the very same object as ``data_frame``
                 when ``inplace`` is True, a deep copy otherwise
        :raises ValueError: if a target column is not present in the data frame
        """
        if not target_columns:
            target_columns = []

        available_columns = data_frame.columns.values.tolist()
        for col in target_columns:
            if col not in available_columns:
                raise ValueError("The target column not found in the data frame")
        # Get all the object-typed (string) columns. The builtin ``object`` is used because
        # ``np.object`` was only an alias of it and has been removed from NumPy.
        obj_columns = data_frame.select_dtypes([object]).columns
        # limit to the columns indicated in target_columns, when any were given
        if target_columns:
            obj_columns = [column for column in obj_columns if column in target_columns]

        # copy the dataframe if needed
        result_frame = data_frame if inplace else data_frame.copy(deep=True)
        for column in obj_columns:
            result_frame[column] = result_frame[column].apply(
                lambda x: self.uri_to_qname(x, prefix_cc_lookup=prefix_cc_lookup, error_fail=error_fail))
        return result_frame

    def uri_to_qname(self, uri_string, prefix_cc_lookup=True, error_fail=True):
        """
        Transform the uri_string to a qname string and remember the namespace.
        If the namespace is not defined, the prefix can be looked up on prefix.cc

        :param uri_string: the string of a URI to be reduced to a QName
        :param prefix_cc_lookup: whether to lookup a namespace on prefix.cc in case it is unknown or not.
        :param error_fail: whether the errors shall fail hard or just issue a warning
        :return: the QName string; or the unchanged ``uri_string`` when the transformation
                 fails and ``error_fail`` is False
        """
        try:
            computed_ns = self.compute_qname_strict(uri_string)
            base_uri = computed_ns[1].toPython()
            if prefix_cc_lookup and base_uri not in self._remote_query_cash:
                # remember the base URI first, so that it is never looked up a second time
                self._remote_query_cash.append(base_uri)
                lookup_result = prefix_cc_lookup_base_uri(base_uri=base_uri)
                if lookup_result:
                    for prefix, namespace in lookup_result.items():  # expecting at most one result
                        self.bind(prefix=prefix, namespace=namespace, override=True, replace=True)
                    # NOTE(review): presumably drops the cached QName computations so that the
                    # freshly bound prefix is used below - confirm against the rdflib version in use
                    self.reset()
            return self.qname_strict(uri_string)
        except Exception:
            logger.warning(f"Could not transform the URI <{uri_string}> to its QName form.")
            if error_fail:
                # bare raise re-raises the active exception with its original traceback
                raise

        return uri_string
51 changes: 51 additions & 0 deletions eds4jinja2/adapters/prefix_cc_fetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/python3

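# prefix_cc_fetcher.py
# Date: 25/02/2021
# Author: Eugeniu Costetchi
# Email: costezki.eugen@gmail.com

"""
This module is a thin client for the prefix.cc web API:
    - lookup of a prefix
    - reverse lookup of a base namespace URI
    - download of all the popular namespace definitions
"""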
import json

import requests

from eds4jinja2.adapters import invert_dict

PREFIX_CC_ALL_JSON = "http://prefix.cc/popular/all.file.json"
PREFIX_CC_LOOKUP_URL = "http://prefix.cc/"
PREFIX_CC_REVERSE_LOOKUP_URL = "http://prefix.cc/reverse"


def prefix_cc_lookup_prefix(prefix: str) -> dict:
    """
    Lookup a prefix at prefix.cc API and return the decoded JSON answer.

    :param prefix: the prefix to look up
    :return: the decoded JSON response (presumably a prefix -> namespace mapping; confirm
             against the prefix.cc API), or None when the request does not succeed or
             the response body is empty
    """
    response = requests.get(url=PREFIX_CC_LOOKUP_URL + f"{prefix}.file.json")
    # A failed request is reported as None, like the other lookups of this module do,
    # instead of attempting to JSON-decode the body of an error response.
    if not response.ok or not response.content:
        return None
    return json.loads(response.content)


def prefix_cc_lookup_base_uri(base_uri: str) -> dict:
    """
    Reverse lookup of a base namespace on the prefix.cc API.

    :param base_uri: the namespace URI to look up
    :return: the decoded JSON answer (a mapping whose values are namespace URIs) when the
             request succeeds and base_uri is among those values; None otherwise
    """
    payload = {"uri": base_uri, "format": "json"}
    response = requests.get(url=PREFIX_CC_REVERSE_LOOKUP_URL, params=payload)
    if not response.ok:
        return None
    namespaces = json.loads(response.content)
    # the inverted mapping is keyed by namespace URI, which allows the membership test
    return namespaces if base_uri in invert_dict(namespaces) else None


def prefix_cc_all() -> dict:
    """
    Download the list of all popular namespace definitions from prefix.cc.

    :return: the decoded JSON document, or None when the request does not succeed
    """
    response = requests.get(url=PREFIX_CC_ALL_JSON)
    if not response.ok:
        return None
    return json.loads(response.content)
30 changes: 27 additions & 3 deletions eds4jinja2/adapters/remote_sparql_ds.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from typing import Optional

from SPARQLWrapper import SPARQLWrapper, JSON, CSV
from py_singleton import singleton

from eds4jinja2.adapters.base_data_source import DataSource
import pandas as pd
Expand All @@ -20,6 +21,27 @@
DEFAULT_ENCODING = 'utf-8'


@singleton
class SPARQLClientPool(object):
    """
    A singleton pool of SPARQL clients: a dictionary that maps each endpoint URL to
    the SPARQLWrapper object created for it.
    The purpose of the pool is to reuse the client objects rather than create a new one
    for every data source, and so to save time.

    NOTE(review): a pooled wrapper is handed out to every requester of the same URL, so any
    state set on it (e.g. the query text set by RemoteSPARQLEndpointDataSource.with_query)
    is shared between them - confirm that this is intended.
    """
    # endpoint URL -> SPARQLWrapper; defined on the class, hence common to all users
    connection_pool = {}

    @staticmethod
    def create_or_reuse_connection(endpoint_url: str):
        """
        Return the pooled client of the endpoint, creating and registering it on first request.

        :param endpoint_url: the URL of the SPARQL endpoint
        :return: the SPARQLWrapper object associated with endpoint_url
        """
        pool = SPARQLClientPool.connection_pool
        if endpoint_url in pool:
            return pool[endpoint_url]
        client = SPARQLWrapper(endpoint_url)
        pool[endpoint_url] = client
        return client


# safe instantiation
SPARQLClientPool.instance()


class RemoteSPARQLEndpointDataSource(DataSource):
"""
Fetches data from SPARQL endpoint. Can be used either with a SPARQL query or a URI to be described.
Expand All @@ -46,11 +68,12 @@ class RemoteSPARQLEndpointDataSource(DataSource):
"""

def __init__(self, endpoint_url):
self.endpoint = SPARQLWrapper(endpoint_url)
self.endpoint = SPARQLClientPool.create_or_reuse_connection(endpoint_url)
self.__can_be_tree = True
self.__can_be_tabular = True

def with_query(self, sparql_query: str, substitution_variables: dict = None, sparql_prefixes: str = "") -> 'RemoteSPARQLEndpointDataSource':
def with_query(self, sparql_query: str, substitution_variables: dict = None,
sparql_prefixes: str = "") -> 'RemoteSPARQLEndpointDataSource':
"""
Set the query text and return the reference to self for chaining.
:return:
Expand All @@ -64,7 +87,8 @@ def with_query(self, sparql_query: str, substitution_variables: dict = None, spa
self.endpoint.setQuery(new_query)
return self

def with_query_from_file(self, sparql_query_file_path: str, substitution_variables: dict = None, prefixes: str = "") -> 'RemoteSPARQLEndpointDataSource':
def with_query_from_file(self, sparql_query_file_path: str, substitution_variables: dict = None,
prefixes: str = "") -> 'RemoteSPARQLEndpointDataSource':
"""
Set the query text and return the reference to self for chaining.
:return:
Expand Down
Loading

0 comments on commit 0a27f74

Please sign in to comment.