From 2167c32342846c1b97619faebd494be398eee0ef Mon Sep 17 00:00:00 2001 From: Ismail Ugur Bayindir Date: Mon, 18 Sep 2023 09:48:36 +0100 Subject: [PATCH] Provide term names for contextual enrichment terms (#45) * Format changes * Added pygraphviz * Refactored context_list usage and added get_context_list method * Version has been updated to 0.1.3 --- pandasaurus_cxg/anndata_enricher.py | 39 ++++++++++++++++++++--------- pandasaurus_cxg/utils/exceptions.py | 4 +-- pyproject.toml | 2 +- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/pandasaurus_cxg/anndata_enricher.py b/pandasaurus_cxg/anndata_enricher.py index b36dbbc..1538aab 100644 --- a/pandasaurus_cxg/anndata_enricher.py +++ b/pandasaurus_cxg/anndata_enricher.py @@ -1,5 +1,5 @@ import itertools -from typing import List, Optional +from typing import List, Optional, Tuple import pandas as pd from anndata import AnnData @@ -23,6 +23,7 @@ def __init__( anndata: AnnData, cell_type_field: Optional[str] = "cell_type_ontology_term_id", context_field: Optional[str] = "tissue_ontology_term_id", + context_field_label: Optional[str] = "tissue", ontology_list_for_slims: Optional[List[str]] = None, ): """Initialize the AnndataEnricher instance with AnnData object. @@ -42,12 +43,13 @@ def __init__( ontology_list_for_slims = ["Cell Ontology"] # TODO Do we need to keep whole anndata? Would it be enough to keep the obs only? self._anndata = anndata - self.__seed_list = self._anndata.obs[cell_type_field].unique().tolist() - self.enricher = Query(self.__seed_list) - self.__context_list = ( + self._seed_list = self._anndata.obs[cell_type_field].unique().tolist() + self.enricher = Query(self._seed_list) + unique_context = self._anndata.obs[[context_field, context_field_label]].drop_duplicates() + self._context_list = ( None if context_field not in self._anndata.obs.keys() - else self._anndata.obs[context_field].unique().tolist() + else dict(zip(unique_context[context_field], unique_context[context_field_label])) ) self.slim_list = [ slim @@ -125,8 +127,8 @@ def contextual_slim_enrichment(self) -> Optional[pd.DataFrame]: """ # TODO Better handle datasets without tissue field return ( - self.enricher.contextual_slim_enrichment(self.__context_list) - if self.__context_list + self.enricher.contextual_slim_enrichment(list(self._context_list.keys())) + if self._context_list else None ) @@ -211,7 +213,7 @@ def set_enricher_property_list(self, property_list: List[str]): Args: property_list (List[str]): The list of properties to include in the enrichment analysis. """ - self.enricher = Query(self.__seed_list, property_list) + self.enricher = Query(self._seed_list, property_list) def validate_slim_list(self, slim_list): """Check if any slim term in the given list is invalid. @@ -230,11 +232,14 @@ def validate_slim_list(self, slim_list): raise InvalidSlimName(invalid_slim_list, self.slim_list) def get_seed_list(self): - return self.__seed_list + return self._seed_list def get_anndata(self): return self._anndata + def get_context_list(self): + return self._context_list + def create_cell_type_dict(self): # TODO Add empty dataframe exception return ( @@ -253,7 +258,17 @@ def create_cell_type_dict(self): .to_dict() ) - def check_subclass_relationships(self, cell_type_list: List[str]): + def check_subclass_relationships(self, cell_type_list: List[str]) -> List[Tuple[str, str]]: + """ + Check for subclass relationships between cell type ontology terms. + + Args: + cell_type_list: A list of cell type ontology term IDs to be used + for cell type annotation. + + Returns: + A list of cell type pairs that have a subClassOf relationship between them. + """ # TODO Add empty dataframe exception subclass_relation = [] for s, o in itertools.combinations(cell_type_list, 2): @@ -262,12 +277,12 @@ def check_subclass_relationships(self, cell_type_list: List[str]): & (self.enricher.enriched_df["p"] == "rdfs:subClassOf") & (self.enricher.enriched_df["o"] == o) ].empty: - subclass_relation.append([s, o]) + subclass_relation.append((s, o)) if not self.enricher.enriched_df[ (self.enricher.enriched_df["s"] == o) & (self.enricher.enriched_df["p"] == "rdfs:subClassOf") & (self.enricher.enriched_df["o"] == s) ].empty: - subclass_relation.append([o, s]) + subclass_relation.append((o, s)) return subclass_relation diff --git a/pandasaurus_cxg/utils/exceptions.py b/pandasaurus_cxg/utils/exceptions.py index fe0dac3..bc13478 100644 --- a/pandasaurus_cxg/utils/exceptions.py +++ b/pandasaurus_cxg/utils/exceptions.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Tuple class InvalidSlimName(Exception): @@ -40,7 +40,7 @@ def __init__(self, enrichment_methods: List[str]): class SubclassWarning(Exception): - def __init__(self, relation: List[List[str]]): + def __init__(self, relation: List[Tuple[str, str]]): joined_relations = ", ".join(["-".join(rel) for rel in relation]) self.message = ( f"The following cell type terms are related with subClassOf relation. " diff --git a/pyproject.toml b/pyproject.toml index 89d3e1c..c447c92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pandasaurus-cxg" -version = "0.1.2" +version = "0.1.3" description = "Ontology enrichment tool for CxG standard AnnData files." authors = ["Ugur Bayindir "] license = "http://www.apache.org/licenses/LICENSE-2.0"