Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support co annotation analysis #9

Merged
merged 12 commits into from
Jun 23, 2023
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[flake8]
ignore = F401, E203, W503
max-line-length = 100
max-line-length = 125
152 changes: 152 additions & 0 deletions pandasaurus_cxg/anndata_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
from enum import Enum
import os
from typing import List

import pandas as pd

from pandasaurus_cxg.anndata_loader import AnndataLoader
from pandasaurus_cxg.schema.schema_loader import read_json_file


# Read the DEBUG environment variable once at import time. The value is the raw
# string (or None when unset); it is later handed to the predicate assignment
# helper, which only tests its truthiness to decide whether to print diagnostics.
debug_mode = os.environ.get("DEBUG")

class AnndataAnalyzer:
    """
    A class providing methods for different types of analysis on an AnnData object.

    Args:
        file_path (str): The path to the AnnData file.
        schema_path (str): The path to the schema file.

    Attributes:
        _anndata_obs (pd.DataFrame): The observation data from the AnnData object.
        _schema (dict): The schema data loaded from the schema file.

    """

    def __init__(self, file_path: str, schema_path: str):
        """
        Initializes an instance of the AnndataAnalyzer class.

        Args:
            file_path (str): The path to the AnnData file.
            schema_path (str): The path to the schema file.

        """
        # Only the obs table is kept; the rest of the loaded object is not retained.
        self._anndata_obs = AnndataLoader.load_from_file(file_path).obs
        self._schema = read_json_file(schema_path)

    def co_annotation_report(self) -> pd.DataFrame:
        """
        Generates a co-annotation report based on the provided schema.

        Every ordered pair of distinct schema fields whose schema value is truthy and
        that exist as obs columns is compared. For each distinct value pair a predicate
        is assigned, ``supercluster_of`` rows are dropped, and rows that carry the same
        information in the opposite direction are reported only once.

        Examples:
            | subclass.l3, dPT, cluster_matches, subclass.full, Degenerative Proximal Tubule Epithelial Cell
            | subclass.l3, aTAL1, subcluster_of, subclass.full, Adaptive / Maladaptive / Repairing Thick Ascending Limb Cell
            | class, epithelial cells, cluster_matches, cell_type, kidney collecting duct intercalated cell

        Returns:
            pd.DataFrame: The co-annotation report with the columns
            ``field_name1``, ``value1``, ``predicate``, ``field_name2``, ``value2``.

        """
        obs = self._anndata_obs
        # Filter once up front instead of re-checking column membership per pair.
        fields = [
            name for name, flagged in self._schema.items() if flagged and name in obs.columns
        ]

        rows = []
        # Loop order (field_name_2 outer, field_name_1 inner) determines which of two
        # mirrored rows is seen first and therefore kept by _remove_duplicates.
        for field_name_2 in fields:
            for field_name_1 in fields:
                if field_name_1 == field_name_2:
                    continue

                co_oc = obs[[field_name_1, field_name_2]].drop_duplicates().reset_index(drop=True)
                if co_oc.empty:
                    # NOTE(review): a row-wise apply on an empty frame does not yield a
                    # Series, so the column assignment below would likely fail; there is
                    # nothing to report for this pair anyway.
                    continue

                # value of one field -> list of values of the other field it co-occurs with
                field_name_2_dict = co_oc.groupby(field_name_2)[field_name_1].apply(list).to_dict()
                field_name_1_dict = co_oc.groupby(field_name_1)[field_name_2].apply(list).to_dict()

                co_oc["predicate"] = co_oc.apply(
                    self._assign_predicate,
                    args=(
                        field_name_1,
                        field_name_2,
                        field_name_1_dict,
                        field_name_2_dict,
                        debug_mode,
                    ),
                    axis=1,
                )

                # Build each output row by column name, already in final column order.
                rows.extend(
                    [
                        field_name_1,
                        record[field_name_1],
                        record["predicate"],
                        field_name_2,
                        record[field_name_2],
                    ]
                    for record in co_oc.to_dict(orient="records")
                )

        return pd.DataFrame(
            self._remove_duplicates(rows),
            columns=["field_name1", "value1", "predicate", "field_name2", "value2"],
        )

    @staticmethod
    def _remove_duplicates(data: List[List[str]]) -> List[List[str]]:
        """
        Drops ``supercluster_of`` rows and rows that repeat an earlier row.

        Two rows count as duplicates when they contain the same set of items,
        regardless of position, so a relation reported in both directions is kept
        only once (the first occurrence wins).

        Args:
            data (List[List[str]]): Flat report rows.

        Returns:
            List[List[str]]: The retained rows, in their original order.

        """
        unique_data = []
        seen = set()

        for sublist in data:
            # The mirrored row of every supercluster_of is a subcluster_of row.
            if Predicate.SUPERCLUSTER_OF.value in sublist:
                continue
            # frozenset gives order-insensitive equality without sorting, so rows that
            # mix non-orderable items (e.g. str and float/int labels) do not raise.
            key = frozenset(sublist)
            if key not in seen:
                seen.add(key)
                unique_data.append(sublist)
        return unique_data

    @staticmethod
    def _assign_predicate(
        row, field_name_1, field_name_2, field_name_1_dict, field_name_2_dict, debug
    ):
        """
        Determines the predicate relating the two values of a co-occurrence row.

        Args:
            row: A row holding one value for ``field_name_1`` and one for ``field_name_2``.
            field_name_1 (str): Name of the first field.
            field_name_2 (str): Name of the second field.
            field_name_1_dict (dict): Maps a ``field_name_1`` value to the list of
                ``field_name_2`` values it co-occurs with.
            field_name_2_dict (dict): Maps a ``field_name_2`` value to the list of
                ``field_name_1`` values it co-occurs with.
            debug: When truthy, the inputs of the decision are printed.

        Returns:
            str: The value of one of the ``Predicate`` members.

        """
        value_1 = row[field_name_1]
        value_2 = row[field_name_2]
        # Partners of each value in the opposite field; empty when the value is unknown.
        field_name_1_values = field_name_1_dict.get(value_1, [])
        field_name_2_values = field_name_2_dict.get(value_2, [])

        if debug:
            print("Debugging row:", row)
            print("Value of field_name_1:", value_1)
            print("Value of field_name_1_dict:", field_name_1_values)
            print("Value of field_name_2:", value_2)
            print("Value of field_name_2_dict:", field_name_2_values)

        # One-to-one: each value has the other as its only partner.
        if field_name_2_values == [value_1] and field_name_1_values == [value_2]:
            return Predicate.CLUSTER_MATCHES.value

        linked = value_1 in field_name_2_values and value_2 in field_name_1_values

        # value_1 has a single partner while value_2 has several.
        if linked and len(field_name_1_values) == 1:
            return Predicate.SUBCLUSTER_OF.value

        # value_2 has a single partner while value_1 has several.
        if linked and len(field_name_2_values) == 1:
            return Predicate.SUPERCLUSTER_OF.value

        return Predicate.CLUSTER_OVERLAPS.value


class Predicate(Enum):
    """Relationship labels emitted in the ``predicate`` column of the co-annotation report."""

    # Assigned when each of the two values has the other as its only partner.
    CLUSTER_MATCHES = "cluster_matches"
    # Fallback when none of the other relations applies.
    CLUSTER_OVERLAPS = "cluster_overlaps"
    # Assigned when the first value has exactly one partner in the second field.
    SUBCLUSTER_OF = "subcluster_of"
    # Assigned when the second value has exactly one partner in the first field.
    SUPERCLUSTER_OF = "supercluster_of"
1 change: 1 addition & 0 deletions pandasaurus_cxg/anndata_enricher.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def __init__(
The slim list is used in minimal_slim_enrichment and full_slim_enrichment.
Defaults to "Cell Ontology"
"""
# TODO Do we need to keep whole anndata? Would it be enough to keep the obs only?
self._anndata = AnndataLoader.load_from_file(file_path)
self.__seed_list = self._anndata.obs[cell_type_field].unique().tolist()
self.__enricher = Query(self.__seed_list)
Expand Down
Empty file.
61 changes: 61 additions & 0 deletions pandasaurus_cxg/schema/schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"nCount_RNA": false,
"nFeature_RNA": false,
"library": false,
"percent.er": false,
"percent.mt": false,
"degen.score": false,
"aEpi.score": false,
"aStr.score": false,
"cyc.score": false,
"matrisome.score": false,
"collagen.score": false,
"glycoprotein.score": false,
"proteoglycan.score": false,
"S.Score": false,
"G2M.Score": false,
"experiment": false,
"specimen": false,
"condition.long": false,
"condition.l1": false,
"condition.l2": false,
"donor_id": false,
"region.l1": false,
"region.l2": false,
"percent.cortex": false,
"percent.medulla": false,
"tissue_type": false,
"id": false,
"pagoda_k100_infomap_coembed": false,
"subclass.full": true,
"subclass.l3": true,
"subclass.l2": true,
"subclass.l1": true,
"state.l2": true,
"state": true,
"class": true,
"structure": false,
"disease_ontology_term_id": false,
"sex_ontology_term_id": false,
"development_stage_ontology_term_id": false,
"self_reported_ethnicity_ontology_term_id": false,
"eGFR": false,
"BMI": false,
"diabetes_history": false,
"hypertension": false,
"tissue_ontology_term_id": false,
"organism_ontology_term_id": false,
"assay_ontology_term_id": false,
"cell_type_ontology_term_id": false,
"is_primary_data": false,
"suspension_type": false,
"cell_type": true,
"assay": false,
"disease": false,
"organism": false,
"sex": false,
"tissue": false,
"self_reported_ethnicity": false,
"development_stage": false,
"author_cell_type": true
}
7 changes: 7 additions & 0 deletions pandasaurus_cxg/schema/schema_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import json


def read_json_file(file_path):
    """
    Reads a JSON file and returns its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document (a dict for an object-rooted file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.

    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
Loading