Skip to content

Commit

Permalink
DOI patch for citation field in uns (#74)
Browse files Browse the repository at this point in the history
* Updated version to 0.1.13

* Added dataset object to rdf representation

* Disabled supercluster_of removal to fix missing nodes in graph generation

* Added add_metadata_nodes method

* Rounded up percentages in add_metadata_nodes

* Refactored add_metadata_nodes to utilize OWL reification for improved visualisation in neo4j.

* Added parsing mechanism for citation field in uns
  • Loading branch information
ubyndr authored May 20, 2024
1 parent 76a37da commit 39c3824
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 1 deletion.
14 changes: 13 additions & 1 deletion pandasaurus_cxg/graph_generator/graph_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from pandasaurus.graph.graph_generator import GraphGenerator as graphgen
from rdflib import OWL, RDF, RDFS, BNode, Graph, Literal, Namespace, URIRef
from rdflib.plugins.sparql import prepareQuery
Expand All @@ -19,9 +18,11 @@
add_edge,
add_node,
add_outgoing_edges_to_subgraph,
citation_field_name,
colour_mapping,
find_and_rotate_center_layout,
generate_subgraph,
parse_citation_field_into_dict,
remove_special_characters,
select_node_with_property,
)
Expand Down Expand Up @@ -129,6 +130,17 @@ def generate_rdf_graph(self):
for key, value in uns.items():
if not isinstance(value, str):
continue
if key == citation_field_name:
citation_dict = parse_citation_field_into_dict(value)
for citation_key, citation_value in citation_dict.items():
self.graph.add(
(
dataset_class,
URIRef(self.ns[citation_key]),
Literal(citation_value),
)
)

self.graph.add((dataset_class, URIRef(self.ns[key]), Literal(value)))
has_source = URIRef(HAS_SOURCE["iri"])
self.graph.add((has_source, RDFS.label, Literal(HAS_SOURCE["label"])))
Expand Down
32 changes: 32 additions & 0 deletions pandasaurus_cxg/graph_generator/graph_generator_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
from typing import Dict

import networkx as nx
from rdflib import OWL, RDF, RDFS, BNode, Graph, Literal, Namespace, URIRef
Expand All @@ -14,6 +15,8 @@
"http://purl.obolibrary.org/obo/PCL_0010001": "cyan",
}

# Key of the `uns` entry whose string value is split into separate citation
# fields by parse_citation_field_into_dict.
citation_field_name = "citation"


def add_edge(nx_graph: nx.Graph, subject, predicate, obj):
edge_data = {
Expand Down Expand Up @@ -126,3 +129,32 @@ def select_node_with_property(graph: Graph, _property: str, value: str):

def remove_special_characters(input_string: str) -> str:
    """
    Return ``input_string`` reduced to ASCII letters, digits and underscores.

    Each space is first turned into an underscore; every remaining character
    outside ``a-z``, ``A-Z``, ``0-9`` and ``_`` is then dropped.
    """
    underscored = input_string.replace(" ", "_")
    return re.sub(r"[^a-zA-Z0-9_]", "", underscored)


def parse_citation_field_into_dict(value: str) -> Dict[str, str]:
    """
    Parse a citation string into a dictionary of its key citation fields.

    The string is tokenised on whitespace and scanned for the marker tokens
    ``Publication:``, ``Version:`` and ``Collection:``. For each marker that
    is present, the single token immediately following its first occurrence
    is taken as the value.

    Args:
        value: The string containing citation fields and values.

    Returns:
        A dictionary containing, for each marker found, one of the keys
        ``publication``, ``download_link`` (taken from ``Version:``) and
        ``collection``, mapped to the token that follows the marker. A marker
        that is the last token of the string maps to an empty string. Markers
        that do not occur are omitted, so the result may be empty.
    """
    # Split on any run of whitespace so that repeated spaces, tabs or line
    # breaks between a marker and its value do not yield empty tokens.
    tokens = value.split()

    # Marker token -> key used in the returned dictionary. "Version:" is
    # reported as "download_link"; the others are the lower-cased marker
    # without its trailing colon.
    marker_to_key = {
        "Publication:": "publication",
        "Version:": "download_link",
        "Collection:": "collection",
    }

    key_value_pairs: Dict[str, str] = {}
    for marker, output_key in marker_to_key.items():
        if marker not in tokens:
            continue
        # Only the first occurrence of a marker is considered.
        value_position = tokens.index(marker) + 1
        if value_position < len(tokens):
            key_value_pairs[output_key] = tokens[value_position]
        else:
            # Marker is the final token: keep the key with an empty value.
            key_value_pairs[output_key] = ""
    return key_value_pairs

0 comments on commit 39c3824

Please sign in to comment.