Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support co annotation analysis #9

Merged
merged 12 commits into from
Jun 23, 2023
50 changes: 50 additions & 0 deletions pandasaurus_cxg/anndata_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from enum import Enum

import pandas as pd

from pandasaurus_cxg.anndata_loader import AnndataLoader
from pandasaurus_cxg.schema.schema_loader import read_json_file


class AnndataAnalyzer:
def __init__(self, file_path: str, schema_path: str):
    """Load the observation table and the schema the analyzer works on.

    Args:
        file_path: Path passed to ``AnndataLoader.load_from_file``.
        schema_path: Path passed to ``read_json_file``.
    """
    # Only the ``obs`` attribute of the loaded object is kept on the instance.
    loaded = AnndataLoader.load_from_file(file_path)
    self._anndata_obs = loaded.obs
    self._schema = read_json_file(schema_path)

def co_annotation_report(self):
dosumis marked this conversation as resolved.
Show resolved Hide resolved
free_text_cell_type = [key for key, value in self._schema.items() if value]
temp_result = []
for text in free_text_cell_type:
if text in self._anndata_obs.columns:
co_oc = (
self._anndata_obs[[text, "cell_type"]].drop_duplicates().reset_index(drop=True)
)
predicate_dict = co_oc.groupby("cell_type")[text].apply(list).to_dict()
co_oc["predicate"] = co_oc.apply(
lambda row: Predicate.CLUSTER_MATCHES.value
if row[text] in predicate_dict.get(row["cell_type"], [])
and len(predicate_dict.get(row["cell_type"], [])) == 1
else (
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit opaque. I think it works, but it looks like it only tests co-annotation against the cell_type field. It should be looking at all fields.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All fields, as in tissue, disease, organism, etc., or just all the other free-text cell type fields?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just cell type fields

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All pairwise combinations of free-text and ontology cell type fields.

Predicate.SUBCLUSTER_OF.value
if row[text] in predicate_dict.get(row["cell_type"], [])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on the if/else logic, it looks like this adds SUBCLUSTER_OF if there are multiple rows with "cell_type" and text. But that could be a subcluster_of relationship in either direction, or an overlap.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me try to explain my reasoning behind this.
Let's say we have the following structure in predicate_dict:

{
'endothelial cell': ['Descending Vasa Recta Endothelial Cell', 'Ascending Vasa Recta Endothelial Cell', 'Afferent / Efferent Arteriole Endothelial Cell', 'Peritubular Capilary Endothelial Cell ', 'Glomerular Capillary Endothelial Cell', 'Degenerative Peritubular Capilary Endothelial Cell', 'Cycling Endothelial Cell', 'Lymphatic Endothelial Cell', 'Degenerative Endothelial Cell'], 
'podocyte': ['Podocyte', 'Degenerative Podocyte'], 
'leukocyte': ['Natural Killer Cell / Natural Killer T Cell', 'M2 Macrophage', 'Neutrophil', 'Monocyte-derived Cell', 'T Cell', 'Plasma Cell', 'Cycling Mononuclear Phagocyte', 'Non-classical Monocyte', 'Classical Dendritic Cell', 'Mast Cell', 'B Cell', 'Plasmacytoid Dendritic Cell', 'Cycling Natural Killer Cell / Natural Killer T Cell']
}

I iterate through the df I have and lets say the first row[text] is 'Descending Vasa Recta Endothelial Cell' and it corresponds to 'endothelial cell' in the cell type field. I check if 'Descending Vasa Recta Endothelial Cell' is in ['Descending Vasa Recta Endothelial Cell', 'Ascending Vasa Recta Endothelial Cell', 'Afferent / Efferent Arteriole Endothelial Cell', 'Peritubular Capilary Endothelial Cell ', 'Glomerular Capillary Endothelial Cell', 'Degenerative Peritubular Capilary Endothelial Cell', 'Cycling Endothelial Cell', 'Lymphatic Endothelial Cell', 'Degenerative Endothelial Cell']. Since the length of the list is not 1 I infer 'Descending Vasa Recta Endothelial Cell' as subcluster_of 'endothelial cell'

Is there any way to determine the direction of this relationship with the tabular data?

I assumed that everything other than cluster_matches and subcluster_of should be cluster_overlaps, but I'm not 100% sure.

else Predicate.CLUSTER_OVERLAPS.value
), # All the other cases should be marked with 'cluster_overlaps', right?
axis=1,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is wrong. I will add a cluster_overlaps example to the ticket.

)
temp_result.extend(co_oc.to_dict(orient="records"))

result = [
[item for sublist in [[k, v] for k, v in record.items()] for item in sublist]
for record in temp_result
]

return pd.DataFrame(
[inner_list[:2] + inner_list[5:6] + inner_list[2:4] for inner_list in result],
columns=["field_name1", "value1", "predicate", "field_name2", "value2"],
)


class Predicate(Enum):
    """Relationship labels written to the ``predicate`` column of the co-annotation report.

    Each member's ``value`` is the string that ends up in the report.
    """

    CLUSTER_MATCHES = "cluster_matches"
    CLUSTER_OVERLAPS = "cluster_overlaps"
    SUBCLUSTER_OF = "subcluster_of"
1 change: 1 addition & 0 deletions pandasaurus_cxg/anndata_enricher.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def __init__(
The slim list is used in minimal_slim_enrichment and full_slim_enrichment.
Defaults to "Cell Ontology"
"""
# TODO Do we need to keep whole anndata? Would it be enough to keep the obs only?
self._anndata = AnndataLoader.load_from_file(file_path)
self.__seed_list = self._anndata.obs[cell_type_field].unique().tolist()
self.__enricher = Query(self.__seed_list)
Expand Down
Empty file.
61 changes: 61 additions & 0 deletions pandasaurus_cxg/schema/schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"nCount_RNA": false,
"nFeature_RNA": false,
"library": false,
"percent.er": false,
"percent.mt": false,
"degen.score": false,
"aEpi.score": false,
"aStr.score": false,
"cyc.score": false,
"matrisome.score": false,
"collagen.score": false,
"glycoprotein.score": false,
"proteoglycan.score": false,
"S.Score": false,
"G2M.Score": false,
"experiment": false,
"specimen": false,
"condition.long": false,
"condition.l1": false,
"condition.l2": false,
"donor_id": false,
"region.l1": false,
"region.l2": false,
"percent.cortex": false,
"percent.medulla": false,
"tissue_type": false,
"id": false,
"pagoda_k100_infomap_coembed": false,
"subclass.full": true,
"subclass.l3": true,
"subclass.l2": true,
"subclass.l1": true,
"state.l2": true,
"state": true,
"class": true,
"structure": false,
"disease_ontology_term_id": false,
"sex_ontology_term_id": false,
"development_stage_ontology_term_id": false,
"self_reported_ethnicity_ontology_term_id": false,
"eGFR": false,
"BMI": false,
"diabetes_history": false,
"hypertension": false,
"tissue_ontology_term_id": false,
"organism_ontology_term_id": false,
"assay_ontology_term_id": false,
"cell_type_ontology_term_id": false,
"is_primary_data": false,
"suspension_type": false,
"cell_type": false,
"assay": false,
"disease": false,
"organism": false,
"sex": false,
"tissue": false,
"self_reported_ethnicity": false,
"development_stage": false,
"author_cell_type": true
}
7 changes: 7 additions & 0 deletions pandasaurus_cxg/schema/schema_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import json


def read_json_file(file_path):
    """Parse the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    # JSON exchanged between systems is UTF-8 (RFC 8259); naming the encoding
    # keeps the result independent of the platform's locale default.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)