Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate graphs using cell sets as unifying concept #24

Merged
merged 32 commits into main from 13-generate-graphs-using-cell-sets-as-unifyi…
Aug 8, 2023
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
f115893
Initial commit
ubyndr Jul 7, 2023
8bbb94c
Refactor
ubyndr Jul 7, 2023
8c3ef0e
Refactored anndata_enricher.py
ubyndr Jul 14, 2023
735c33f
Added docstring to _assign_predicate and refactored _remove_duplicates
ubyndr Jul 14, 2023
14f1b9f
Implemented missing methods in graph_generator.py
ubyndr Jul 14, 2023
7410482
Added InvalidGraphFormat and MissingEnrichmentProcess exceptions
ubyndr Jul 14, 2023
607c9b2
Updated .gitignore
ubyndr Jul 14, 2023
aca0f73
Added CellType to nodes that represent CL terms
ubyndr Jul 18, 2023
9438edc
Added pygraphviz dependency
ubyndr Jul 21, 2023
0e7248f
Refactored visualize_rdf_graph method
ubyndr Jul 21, 2023
ee69211
Added consists_of relations as OWL.Restriction
ubyndr Jul 21, 2023
95c926a
Format changes
ubyndr Jul 21, 2023
dda2fa2
Refactored visualize_rdf_graph method
ubyndr Jul 25, 2023
bb04f1e
Updated walkthrough.ipynb
ubyndr Jul 25, 2023
4117cd4
Merge branch 'main' into 13-generate-graphs-using-cell-sets-as-unifyi…
Jul 25, 2023
7698ed5
Updated .gitignore
ubyndr Jul 27, 2023
1c3fb91
Merged from main
ubyndr Jul 28, 2023
4684594
Updated anndata_analyzer.py
ubyndr Jul 28, 2023
45312ca
Removed state and state.l2 from free-text annotations
ubyndr Jul 28, 2023
54da01d
Refactored visualize_rdf_graph method
ubyndr Jul 28, 2023
786c681
Added transitive_reduction method (#29)
Aug 1, 2023
6f8d4fa
Resolved conflicts
ubyndr Aug 1, 2023
625b237
Refactored cell_type_dict initialization
ubyndr Aug 2, 2023
0ad1b4e
Revert "Refactored cell_type_dict initialization"
ubyndr Aug 2, 2023
a38e12d
Refactored cell_type_dict initialization
ubyndr Aug 2, 2023
227f137
Added oaklib
ubyndr Aug 4, 2023
ecb8b88
Added logging to transitive_reduction and refactored visualize_rdf_gr…
ubyndr Aug 4, 2023
7930f81
Refactored edge_data generation
ubyndr Aug 4, 2023
dc71801
Fixed issues in nx_graph generation
ubyndr Aug 4, 2023
b5427b7
Refactored logging configuration, and added add_label_to_terms method
ubyndr Aug 8, 2023
026071e
Added add_node method
ubyndr Aug 8, 2023
f7108a9
Merge branch 'main' into 13-generate-graphs-using-cell-sets-as-unifyi…
Aug 8, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,12 @@ dmypy.json
# Pyre type checker
.pyre/

# Ontology files
*.owl
*.ttl
*.nt

# pycharm/mac
.DS_Store
.idea/
.DS_Store
.DS_Store
65 changes: 25 additions & 40 deletions pandasaurus_cxg/anndata_analyzer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from enum import Enum
import itertools
import os
from enum import Enum
from typing import List, Optional

import pandas as pd
Expand All @@ -9,7 +8,6 @@
from pandasaurus_cxg.anndata_loader import AnndataLoader
from pandasaurus_cxg.schema.schema_loader import read_json_file


# Check if the DEBUG environment variable is set
debug_mode = os.getenv("DEBUG")

Expand Down Expand Up @@ -59,20 +57,22 @@ def co_annotation_report(self, disease: Optional[str] = None, enrich: bool = Fal
"""
free_text_cell_type = [key for key, value in self._schema.items() if value]
temp_result = []
cell_type_combinations = list(itertools.combinations(free_text_cell_type, 2))
for combination in cell_type_combinations:
field_name_1 = combination[1]
field_name_2 = combination[0]
if (field_name_1 in self._anndata.obs.columns
and field_name_2 in self._anndata.obs.columns
):
co_oc = self._filter_data_and_remove_duplicates(field_name_1, field_name_2, disease)

if enrich:
co_oc = self._enrich_co_annotation(co_oc, field_name_1, field_name_2)

AnndataAnalyzer._assign_predicate_column(co_oc, field_name_1, field_name_2)
temp_result.extend(co_oc.to_dict(orient="records"))
for field_name_2 in free_text_cell_type:
for field_name_1 in free_text_cell_type:
if (
field_name_1 != field_name_2
and field_name_1 in self._anndata.obs.columns
and field_name_2 in self._anndata.obs.columns
):
co_oc = self._filter_data_and_drop_duplicates(
field_name_1, field_name_2, disease
)

if enrich:
co_oc = self._enrich_co_annotation(co_oc, field_name_1, field_name_2)

AnndataAnalyzer._assign_predicate_column(co_oc, field_name_1, field_name_2)
temp_result.extend(co_oc.to_dict(orient="records"))

result = [
[item for sublist in [[k, v] for k, v in record.items()] for item in sublist]
Expand All @@ -84,19 +84,6 @@ def co_annotation_report(self, disease: Optional[str] = None, enrich: bool = Fal
columns=["field_name1", "value1", "predicate", "field_name2", "value2"],
)

def _filter_data_and_remove_duplicates(self, field_name_1, field_name_2, disease):
# Filter the data based on the disease condition
co_oc = (
self._anndata.obs[
(self._anndata.obs["disease_ontology_term_id"].str.lower() == disease.lower())
][[field_name_1, field_name_2]]
if disease
else self._anndata.obs[[field_name_1, field_name_2]]
)
# Drop duplicates
co_oc = co_oc.drop_duplicates().reset_index(drop=True)
return co_oc

def enriched_co_annotation_report(self, disease: Optional[str] = None):
"""
Generates an enriched co-annotation report based on the provided schema. The enrichment
Expand All @@ -121,13 +108,13 @@ def enriched_co_annotation_report(self, disease: Optional[str] = None):
def _enrich_co_annotation(self, co_oc, field_name_1, field_name_2):
enricher = AnndataEnricher(self._anndata)
simple = enricher.simple_enrichment()
df = simple[simple["o"].isin(enricher.get_seed_list())][
["s_label", "o_label"]
].rename(columns={"s_label": field_name_1, "o_label": field_name_2})
df = simple[simple["o"].isin(enricher.get_seed_list())][["s_label", "o_label"]].rename(
columns={"s_label": field_name_1, "o_label": field_name_2}
)
co_oc = pd.concat([co_oc, df], axis=0).reset_index(drop=True)
return co_oc

def _filter_data_and_remove_duplicates(self, field_name_1, field_name_2, disease):
def _filter_data_and_drop_duplicates(self, field_name_1, field_name_2, disease):
# Filter the data based on the disease condition
co_oc = (
self._anndata.obs[
Expand All @@ -141,17 +128,15 @@ def _filter_data_and_remove_duplicates(self, field_name_1, field_name_2, disease
return co_oc

@staticmethod
def _remove_duplicates(data: List[List[str]]) -> List[List[str]]:
def _remove_duplicates(data: List[List[str]]):
# TODO: clean up/rename if necessary
# Currently used only to clean up supercluster_of relations
unique_data = []
unique_set = set()

for sublist in data:
if Predicate.SUPERCLUSTER_OF.value in sublist:
continue
sorted_sublist = tuple(sorted(set(sublist)))
if sorted_sublist not in unique_set:
unique_data.append(sublist)
unique_set.add(sorted_sublist)
unique_data.append(sublist)
return unique_data

@staticmethod
Expand Down
29 changes: 24 additions & 5 deletions pandasaurus_cxg/anndata_enricher.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import List, Optional

from anndata import AnnData
import pandas as pd
from anndata import AnnData
from pandasaurus.query import Query
from pandasaurus.slim_manager import SlimManager

Expand All @@ -18,9 +18,9 @@ def __init__(
anndata: AnnData,
cell_type_field: Optional[str] = "cell_type_ontology_term_id",
context_field: Optional[str] = "tissue_ontology_term_id",
ontology_list_for_slims: Optional[List[str]] = ["Cell Ontology"],
ontology_list_for_slims: Optional[List[str]] = None,
):
"""Initialize the AnndataEnricher instance.
"""Initialize the AnndataEnricher instance with AnnData object.

Args:

Expand All @@ -33,8 +33,9 @@ def __init__(
The slim list is used in minimal_slim_enrichment and full_slim_enrichment.
Defaults to ["Cell Ontology"] when None is passed.
"""
if ontology_list_for_slims is None:
ontology_list_for_slims = ["Cell Ontology"]
# TODO: Do we need to keep the whole anndata? Would it be enough to keep only the obs?
# file_path: The path to the file containing the anndata object.
self._anndata = anndata
self.__seed_list = self._anndata.obs[cell_type_field].unique().tolist()
self.__enricher = Query(self.__seed_list)
Expand All @@ -55,8 +56,23 @@ def from_file_path(
file_path: str,
cell_type_field: Optional[str] = "cell_type_ontology_term_id",
context_field: Optional[str] = "tissue_ontology_term_id",
ontology_list_for_slims: Optional[List[str]] = ["Cell Ontology"],
ontology_list_for_slims: Optional[List[str]] = None,
):
"""Initialize the AnndataEnricher instance with file path.

Args:

file_path: The path to the file containing the anndata object.
cell_type_field: The cell type information in the anndata object.
Defaults to "cell_type_ontology_term_id".
context_field: The context information in the anndata object.
Defaults to "tissue_ontology_term_id".
ontology_list_for_slims: The ontology list for generating the slim list.
The slim list is used in minimal_slim_enrichment and full_slim_enrichment.
Defaults to ["Cell Ontology"] when None is passed.
"""
if ontology_list_for_slims is None:
ontology_list_for_slims = ["Cell Ontology"]
return AnndataEnricher(
AnndataLoader.load_from_file(file_path),
cell_type_field,
Expand Down Expand Up @@ -152,3 +168,6 @@ def validate_slim_list(self, slim_list):

# Accessor for the seed list: the unique values of the cell-type obs column
# that __init__ collected into the name-mangled __seed_list attribute.
def get_seed_list(self):
return self.__seed_list

# Accessor for the AnnData object that was handed to __init__ and stored
# unchanged on the instance as _anndata.
def get_anndata(self):
return self._anndata
Empty file.
Loading