Resolved conflicts

INCATools · Aug 1, 2023 · 6f8d4fa · 6f8d4fa
2 parents 54da01d + 786c681
commit 6f8d4fa
Show file tree

Hide file tree

Showing 2 changed files with 87 additions and 42 deletions.
diff --git a/pandasaurus_cxg/anndata_analyzer.py b/pandasaurus_cxg/anndata_analyzer.py
@@ -1,4 +1,3 @@
-import itertools
 import os
 from enum import Enum
 from typing import List, Optional
@@ -65,7 +64,9 @@ def co_annotation_report(self, disease: Optional[str] = None, enrich: bool = Fal
                     and field_name_1 in self._anndata.obs.columns
                     and field_name_2 in self._anndata.obs.columns
                 ):
-                    co_oc = self._filter_data_and_drop_duplicates(field_name_1, field_name_2, disease)
+                    co_oc = self._filter_data_and_drop_duplicates(
+                        field_name_1, field_name_2, disease
+                    )
 
                     if enrich:
                         co_oc = self._enrich_co_annotation(co_oc, field_name_1, field_name_2)

diff --git a/pandasaurus_cxg/graph_generator/graph_generator.py b/pandasaurus_cxg/graph_generator/graph_generator.py
@@ -135,26 +135,34 @@ def enrich_rdf_graph(self):
                 self.graph.add((class_expression_bnode, OWL.someValuesFrom, resource))
                 # Add the restriction
                 self.graph.add((s, RDF.type, class_expression_bnode))
-                # self.graph.add((s, self.ns["consist_of"], resource))
         # add subClassOf between terms in CL enrichment
         for _, row in self.enriched_df.iterrows():
             for s, _, _ in self.graph.triples((None, RDFS.label, Literal(row["s_label"]))):
                 for o, _, _ in self.graph.triples((None, RDFS.label, Literal(row["o_label"]))):
                     self.graph.add((s, RDFS.subClassOf, o))
 
-    def save_rdf_graph(self, file_name: str = "mygraph", _format: str = "xml"):
+    def save_rdf_graph(
+        self,
+        graph: Optional[Graph],
+        file_name: Optional[str] = "mygraph",
+        _format: Optional[str] = "xml",
+    ):
         """
         Serializes and saves the RDF graph to a file.
 
         Args:
-            file_name (str, optional): The name of the output file without the extension.
+            graph: An optional RDF graph that will be serialized.
+                If provided, this graph will be used for serialization.
+                If not provided, the graph inside the GraphGenerator instance will be used.
+            file_name: The name of the output file without the extension.
                 Defaults to "mygraph".
-            _format (str, optional): The format of the RDF serialization. Defaults to "xml".
+            _format: The format of the RDF serialization. Defaults to "xml".
 
         Raises:
             InvalidGraphFormat: If the provided _format is not valid.
 
         """
+        graph = graph if graph else self.graph
         format_extension = {
             RDFFormat.RDF_XML.value: "owl",
             RDFFormat.TURTLE.value: "ttl",
@@ -163,68 +171,63 @@ def save_rdf_graph(self, file_name: str = "mygraph", _format: str = "xml"):
 
         if _format in format_extension:
             file_extension = format_extension[_format]
-            self.graph.serialize(f"{file_name}.{file_extension}", format=_format)
+            graph.serialize(f"{file_name}.{file_extension}", format=_format)
         else:
             valid_formats = [valid_format.value for valid_format in RDFFormat]
             raise InvalidGraphFormat(RDFFormat, valid_formats)
 
-    def visualize_rdf_graph(self, start_node: List[str], file_path: str):
+    def visualize_rdf_graph(self, start_node: List[str], predicate: str, file_path: str):
         # TODO visualize all graph, with parametric annotation properties to better visualize the nodes.
         # TODO apply redundancy striping to owl directly
-        stack = []
-        visited = set()
-        subgraph = Graph()
         graph = Graph().parse(file_path, format="ttl") if file_path else self.graph
+        if predicate and not graph.query(f"ASK {{ ?s {self.ns[predicate].n3()} ?o }}"):
+            raise ValueError(f"The {self.ns[predicate]} relation does not exist in the graph")
         if start_node:
             for node in start_node:
                 if not URIRef(node) in graph.subjects():
                     raise ValueError(
                         f"None of the nodes in the list {node} exist in the RDF graph."
                     )
-            stack = [URIRef(node) for node in start_node]
+        visited = set()
+        subgraph = Graph()
+        stack = [URIRef(node) for node in start_node] if start_node else None
 
         while stack:
             node = stack.pop()
             if node not in visited:
                 visited.add(node)
-                for subject, predicate, obj in graph.triples((node, None, None)):
-                    if predicate != RDF.type:
-                        # Add all outgoing edges of the current node  # and not isinstance(obj, Literal)
-                        subgraph.add((subject, predicate, obj))
-                for s, p, next_node in graph.triples((node, None, None)):
-                    # stack.append(next_node)
-                    if not isinstance(next_node, BNode):
-                        stack.append(next_node)
-                    else:
-                        subgraph.add(
-                            (
-                                node,
-                                next(graph.objects(next_node, OWL.onProperty)),
-                                next(graph.objects(next_node, OWL.someValuesFrom)),
-                            )
-                        )
+            for s, p, o in graph.triples((node, self.ns[predicate] if predicate else None, None)):
+                # Add all outgoing edges of the current node
+                subgraph.add((s, p, o))
+            for s, p, next_node in graph.triples(
+                (node, self.ns[predicate] if predicate else None, None)
+            ):
+                if not isinstance(next_node, BNode):
+                    stack.append(next_node)
+                # else:
+                #     subgraph.add(
+                #         (
+                #             node,
+                #             next(graph.objects(next_node, OWL.onProperty)),
+                #             next(graph.objects(next_node, OWL.someValuesFrom)),
+                #         )
+                #     )
+                # TODO not sure if we need this else or not
+
+        if not start_node:
+            for s, p, o in graph.triples((None, self.ns[predicate] if predicate else None, None)):
+                # Add all outgoing edges of the current node
+                subgraph.add((s, p, o))
 
         nx_graph = nx.DiGraph()
         for subject, predicate, obj in subgraph:
             if isinstance(obj, URIRef) and predicate != RDF.type:
-                edge_data = {
-                    "label": "is_a" if predicate == RDF.type else str(predicate).split("/")[-1]
-                }
+                edge_data = {"label": str(predicate).split("/")[-1]}
                 nx_graph.add_edge(
                     str(subject).split("/")[-1],
                     str(obj).split("/")[-1],
                     **edge_data,
                 )
-                # str([o for s, p, o in graph.triples((subject, None, None)) if isinstance(o, Literal)][-1])
-                # nx_graph.add_edge(
-                #     str(subgraph.value(subject=subject, predicate=self.ns["cell_type"]))
-                #     if subgraph.value(subject=subject, predicate=self.ns["cell_type"])
-                #     else str(subject).split("/")[-1],
-                #     subgraph.value(subject=obj, predicate=self.ns["cell_type"])
-                #     if subgraph.value(subject=obj, predicate=self.ns["cell_type"])
-                #     else str(obj).split("/")[-1],
-                #     **edge_data,
-                # )
 
         # Apply transitive reduction to remove redundancy
         transitive_reduction_graph = nx.transitive_reduction(nx_graph)
@@ -237,11 +240,13 @@ def visualize_rdf_graph(self, start_node: List[str], file_path: str):
         pos = nx.drawing.nx_agraph.graphviz_layout(transitive_reduction_graph, prog="dot")
 
         # Plot the graph as a hierarchical tree
-        plt.figure(figsize=(10, 8))
+        node_labels = nx.get_node_attributes(transitive_reduction_graph, "label")
+        plt.figure(figsize=(10, 10))
         nx.draw(
             transitive_reduction_graph,
             pos,
             with_labels=True,
+            labels=node_labels,
             node_size=1000,
             node_color="skyblue",
             font_size=8,
@@ -259,6 +264,45 @@ def visualize_rdf_graph(self, start_node: List[str], file_path: str):
         )
         plt.show()
 
+    def transitive_reduction(self, predicate: str, file_path: str, _format: str = "xml"):
+        """
+        Removes transitively redundant triples from a graph and saves the result.
+
+        Args:
+            predicate: Local name, resolved against `self.ns`, of the relation to reduce.
+                If falsy, every relation except rdf:type is reduced together.
+            file_path: Path of a Turtle file to load. If falsy, `self.graph` is used
+                and modified in place.
+            _format: The format of the RDF serialization. Defaults to "xml".
+
+        Raises:
+            ValueError: If the given relation does not exist in the graph.
+
+        """
+        from os.path import splitext
+
+        graph = Graph().parse(file_path, format="ttl") if file_path else self.graph
+        if predicate and not graph.query(f"ASK {{ ?s {self.ns[predicate].n3()} ?o }}"):
+            raise ValueError(f"The {self.ns[predicate]} relation does not exist in the graph")
+
+        # Nodes are the full URIRefs and each edge remembers its predicates, so the
+        # redundant triples can be removed exactly as they occur in the graph.
+        nx_graph = nx.DiGraph()
+        for s, p, o in graph.triples((None, self.ns[predicate] if predicate else None, None)):
+            if isinstance(o, URIRef) and p != RDF.type:
+                nx_graph.add_edge(s, o)
+                nx_graph.edges[s, o].setdefault("predicates", set()).add(p)
+
+        # networkx raises NetworkXError here when the relation contains a cycle
+        reduced_graph = nx.transitive_reduction(nx_graph)
+        for s, o in set(nx_graph.edges) - set(reduced_graph.edges):
+            for p in nx_graph.edges[s, o]["predicates"]:
+                graph.remove((s, p, o))
+
+        # splitext keeps dots in directory names, e.g. "./out/graph.ttl" -> "./out/graph"
+        base_name = splitext(file_path)[0] if file_path else "mygraph"
+        self.save_rdf_graph(graph, f"{base_name}_non_redundant", _format)
+
 
 class RDFFormat(Enum):
     RDF_XML = "xml"