Skip to content

Commit

Permalink
Resolved conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
ubyndr committed Aug 1, 2023
2 parents 54da01d + 786c681 commit 6f8d4fa
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 42 deletions.
5 changes: 3 additions & 2 deletions pandasaurus_cxg/anndata_analyzer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import itertools
import os
from enum import Enum
from typing import List, Optional
Expand Down Expand Up @@ -65,7 +64,9 @@ def co_annotation_report(self, disease: Optional[str] = None, enrich: bool = Fal
and field_name_1 in self._anndata.obs.columns
and field_name_2 in self._anndata.obs.columns
):
co_oc = self._filter_data_and_drop_duplicates(field_name_1, field_name_2, disease)
co_oc = self._filter_data_and_drop_duplicates(
field_name_1, field_name_2, disease
)

if enrich:
co_oc = self._enrich_co_annotation(co_oc, field_name_1, field_name_2)
Expand Down
124 changes: 84 additions & 40 deletions pandasaurus_cxg/graph_generator/graph_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,26 +135,34 @@ def enrich_rdf_graph(self):
self.graph.add((class_expression_bnode, OWL.someValuesFrom, resource))
# Add the restriction
self.graph.add((s, RDF.type, class_expression_bnode))
# self.graph.add((s, self.ns["consist_of"], resource))
# add subClassOf between terms in CL enrichment
for _, row in self.enriched_df.iterrows():
for s, _, _ in self.graph.triples((None, RDFS.label, Literal(row["s_label"]))):
for o, _, _ in self.graph.triples((None, RDFS.label, Literal(row["o_label"]))):
self.graph.add((s, RDFS.subClassOf, o))

def save_rdf_graph(self, file_name: str = "mygraph", _format: str = "xml"):
def save_rdf_graph(
self,
graph: Optional[Graph],
file_name: Optional[str] = "mygraph",
_format: Optional[str] = "xml",
):
"""
Serializes and saves the RDF graph to a file.
Args:
file_name (str, optional): The name of the output file without the extension.
graph: An optional RDF graph that will be serialized.
If provided, this graph will be used for serialization.
If not provided, the graph inside the GraphGenerator instance will be used.
file_name: The name of the output file without the extension.
Defaults to "mygraph".
_format (str, optional): The format of the RDF serialization. Defaults to "xml".
_format: The format of the RDF serialization. Defaults to "xml".
Raises:
InvalidGraphFormat: If the provided _format is not valid.
"""
graph = graph if graph else self.graph
format_extension = {
RDFFormat.RDF_XML.value: "owl",
RDFFormat.TURTLE.value: "ttl",
Expand All @@ -163,68 +171,63 @@ def save_rdf_graph(self, file_name: str = "mygraph", _format: str = "xml"):

if _format in format_extension:
file_extension = format_extension[_format]
self.graph.serialize(f"{file_name}.{file_extension}", format=_format)
graph.serialize(f"{file_name}.{file_extension}", format=_format)
else:
valid_formats = [valid_format.value for valid_format in RDFFormat]
raise InvalidGraphFormat(RDFFormat, valid_formats)

def visualize_rdf_graph(self, start_node: List[str], file_path: str):
def visualize_rdf_graph(self, start_node: List[str], predicate: str, file_path: str):
# TODO visualize the whole graph, with parametric annotation properties to better visualize the nodes.
# TODO apply redundancy stripping to the OWL file directly
stack = []
visited = set()
subgraph = Graph()
graph = Graph().parse(file_path, format="ttl") if file_path else self.graph
if predicate and not graph.query(f"ASK {{ ?s {self.ns[predicate].n3()} ?o }}"):
raise ValueError(f"The {self.ns[predicate]} relation does not exist in the graph")
if start_node:
for node in start_node:
if not URIRef(node) in graph.subjects():
raise ValueError(
f"None of the nodes in the list {node} exist in the RDF graph."
)
stack = [URIRef(node) for node in start_node]
visited = set()
subgraph = Graph()
stack = [URIRef(node) for node in start_node] if start_node else None

while stack:
node = stack.pop()
if node not in visited:
visited.add(node)
for subject, predicate, obj in graph.triples((node, None, None)):
if predicate != RDF.type:
# Add all outgoing edges of the current node # and not isinstance(obj, Literal)
subgraph.add((subject, predicate, obj))
for s, p, next_node in graph.triples((node, None, None)):
# stack.append(next_node)
if not isinstance(next_node, BNode):
stack.append(next_node)
else:
subgraph.add(
(
node,
next(graph.objects(next_node, OWL.onProperty)),
next(graph.objects(next_node, OWL.someValuesFrom)),
)
)
for s, p, o in graph.triples((node, self.ns[predicate] if predicate else None, None)):
# Add all outgoing edges of the current node
subgraph.add((s, p, o))
for s, p, next_node in graph.triples(
(node, self.ns[predicate] if predicate else None, None)
):
if not isinstance(next_node, BNode):
stack.append(next_node)
# else:
# subgraph.add(
# (
# node,
# next(graph.objects(next_node, OWL.onProperty)),
# next(graph.objects(next_node, OWL.someValuesFrom)),
# )
# )
# TODO not sure if we need this else or not

if not start_node:
for s, p, o in graph.triples((None, self.ns[predicate] if predicate else None, None)):
# Add all outgoing edges of the current node
subgraph.add((s, p, o))

nx_graph = nx.DiGraph()
for subject, predicate, obj in subgraph:
if isinstance(obj, URIRef) and predicate != RDF.type:
edge_data = {
"label": "is_a" if predicate == RDF.type else str(predicate).split("/")[-1]
}
edge_data = {"label": str(predicate).split("/")[-1]}
nx_graph.add_edge(
str(subject).split("/")[-1],
str(obj).split("/")[-1],
**edge_data,
)
# str([o for s, p, o in graph.triples((subject, None, None)) if isinstance(o, Literal)][-1])
# nx_graph.add_edge(
# str(subgraph.value(subject=subject, predicate=self.ns["cell_type"]))
# if subgraph.value(subject=subject, predicate=self.ns["cell_type"])
# else str(subject).split("/")[-1],
# subgraph.value(subject=obj, predicate=self.ns["cell_type"])
# if subgraph.value(subject=obj, predicate=self.ns["cell_type"])
# else str(obj).split("/")[-1],
# **edge_data,
# )

# Apply transitive reduction to remove redundancy
transitive_reduction_graph = nx.transitive_reduction(nx_graph)
Expand All @@ -237,11 +240,13 @@ def visualize_rdf_graph(self, start_node: List[str], file_path: str):
pos = nx.drawing.nx_agraph.graphviz_layout(transitive_reduction_graph, prog="dot")

# Plot the graph as a hierarchical tree
plt.figure(figsize=(10, 8))
node_labels = nx.get_node_attributes(transitive_reduction_graph, "label")
plt.figure(figsize=(10, 10))
nx.draw(
transitive_reduction_graph,
pos,
with_labels=True,
labels=node_labels,
node_size=1000,
node_color="skyblue",
font_size=8,
Expand All @@ -259,6 +264,45 @@ def visualize_rdf_graph(self, start_node: List[str], file_path: str):
)
plt.show()

def transitive_reduction(self, predicate: str, file_path: str, _format: str = "xml"):
    """
    Removes transitively redundant relations from an RDF graph and saves the result.

    A relation ``A -> C`` is redundant when it is already implied by a longer
    path such as ``A -> B -> C``. Redundant triples are removed from the graph
    and the reduced graph is written with ``save_rdf_graph``.

    Args:
        predicate: Local name (within ``self.ns``) of the relation to reduce.
            If empty or None, every relation except ``rdf:type`` whose object
            is a URI is considered.
        file_path: Path of a Turtle file to load. If empty or None, the graph
            held by this instance is used and is modified in place.
        _format: The format of the RDF serialization. Defaults to "xml".

    Raises:
        ValueError: If ``predicate`` is given but no triple in the graph uses it.
        networkx.NetworkXError: Raised by ``nx.transitive_reduction`` when the
            relation graph is not acyclic.
        InvalidGraphFormat: Raised by ``save_rdf_graph`` if ``_format`` is not valid.
    """
    import os

    graph = Graph().parse(file_path, format="ttl") if file_path else self.graph
    predicate_uri = self.ns[predicate] if predicate else None
    if predicate_uri is not None and not graph.query(
        f"ASK {{ ?s {predicate_uri.n3()} ?o }}"
    ):
        raise ValueError(f"The {predicate_uri} relation does not exist in the graph")

    # Remember which predicates produced each (subject, object) edge. Keeping the
    # original RDF terms (instead of the last URI path segment) lets us remove the
    # exact triples later, whatever namespace the terms belong to.
    edge_predicates = {}
    for s, p, o in graph.triples((None, predicate_uri, None)):
        if isinstance(o, URIRef) and p != RDF.type:
            edge_predicates.setdefault((s, o), []).append(p)

    nx_graph = nx.DiGraph()
    nx_graph.add_edges_from(edge_predicates)

    # Apply transitive reduction to find the minimal set of edges
    reduced_graph = nx.transitive_reduction(nx_graph)

    # Every edge dropped by the reduction corresponds to redundant triples.
    # All triples were collected above, so mutating the graph here is safe.
    for s, o in set(nx_graph.edges) - set(reduced_graph.edges):
        for p in edge_predicates[(s, o)]:
            graph.remove((s, p, o))

    # splitext only strips the final extension, so dots elsewhere in the path
    # (e.g. "./out/graph.ttl") are preserved. Fall back to the default file name
    # of save_rdf_graph when the in-memory graph was used.
    base_name = os.path.splitext(file_path)[0] if file_path else "mygraph"
    self.save_rdf_graph(graph, f"{base_name}_non_redundant", _format)


class RDFFormat(Enum):
RDF_XML = "xml"
Expand Down

0 comments on commit 6f8d4fa

Please sign in to comment.