INCATools · ubyndr · Jun 23, 2023 · Jun 16, 2023 · Jun 16, 2023 · Jun 16, 2023
diff --git a/pandasaurus_cxg/anndata_analyzer.py b/pandasaurus_cxg/anndata_analyzer.py
@@ -0,0 +1,50 @@
+from enum import Enum
+
+import pandas as pd
+
+from pandasaurus_cxg.anndata_loader import AnndataLoader
+from pandasaurus_cxg.schema.schema_loader import read_json_file
+
+
+class AnndataAnalyzer:
+    """Report how free-text annotation columns in ``obs`` relate to ``cell_type``.
+
+    The schema loaded from ``schema_path`` is read as a mapping from ``obs`` column name
+    to a flag; columns with a truthy flag are the ones analysed.
+    """
+
+    def __init__(self, file_path: str, schema_path: str):
+        """Load the ``obs`` table and the column schema.
+
+        Args:
+            file_path: Path passed to ``AnndataLoader.load_from_file``.
+            schema_path: Path of the JSON schema passed to ``read_json_file``.
+        """
+        self._anndata_obs = AnndataLoader.load_from_file(file_path).obs
+        self._schema = read_json_file(schema_path)
+
+    def co_annotation_report(self) -> pd.DataFrame:
+        """Relate every flagged annotation value to the cell types it co-occurs with.
+
+        For each flagged column that exists in ``obs``, every distinct
+        (annotation value, ``cell_type``) pair produces one row. Its predicate is:
+
+        * ``cluster_matches``: the value and the cell type occur only with each other.
+        * ``subcluster_of``: the value occurs with this cell type only, while the cell
+          type also occurs with other values of the same column.
+        * ``cluster_overlaps``: every other case, i.e. the value also occurs with other
+          cell types, or one side of the pair is missing.
+
+        Flagged columns that are absent from ``obs`` are skipped, as is ``cell_type``
+        itself.
+
+        Returns:
+            A DataFrame with the columns ``field_name1``, ``value1``, ``predicate``,
+            ``field_name2`` and ``value2``; ``field_name2`` is always ``"cell_type"``.
+        """
+        obs = self._anndata_obs
+        rows = []
+        for field, is_flagged in self._schema.items():
+            if not is_flagged or field == "cell_type" or field not in obs.columns:
+                continue
+            pairs = obs[[field, "cell_type"]].drop_duplicates().reset_index(drop=True)
+            # Count partners in both directions: looking only at the cell type's side
+            # cannot tell a value confined to one cell type from one spread over several.
+            values_per_cell_type = (
+                pairs.groupby("cell_type", observed=True)[field].nunique(dropna=False).to_dict()
+            )
+            cell_types_per_value = (
+                pairs.groupby(field, observed=True)["cell_type"].nunique(dropna=False).to_dict()
+            )
+            for value, cell_type in zip(pairs[field], pairs["cell_type"]):
+                # Missing keys are dropped by groupby, so they fall back to a count of 0.
+                predicate = self._assign_predicate(
+                    values_per_cell_type.get(cell_type, 0),
+                    cell_types_per_value.get(value, 0),
+                )
+                rows.append([field, value, predicate, "cell_type", cell_type])
+
+        return pd.DataFrame(
+            rows,
+            columns=["field_name1", "value1", "predicate", "field_name2", "value2"],
+        )
+
+    @staticmethod
+    def _assign_predicate(values_for_cell_type: int, cell_types_for_value: int) -> str:
+        """Pick the predicate string for one pair from the partner counts of both sides."""
+        if cell_types_for_value == 1 and values_for_cell_type == 1:
+            return Predicate.CLUSTER_MATCHES.value
+        if cell_types_for_value == 1 and values_for_cell_type > 1:
+            return Predicate.SUBCLUSTER_OF.value
+        return Predicate.CLUSTER_OVERLAPS.value
+
+
+class Predicate(Enum):
+    """Relationship labels written to the ``predicate`` column of the co-annotation report."""
+
+    # Each member's value is the exact string emitted in the report.
+    CLUSTER_MATCHES = "cluster_matches"
+    CLUSTER_OVERLAPS = "cluster_overlaps"
+    SUBCLUSTER_OF = "subcluster_of"
diff --git a/pandasaurus_cxg/anndata_enricher.py b/pandasaurus_cxg/anndata_enricher.py
@@ -31,6 +31,7 @@ def __init__(
                 The slim list is used in minimal_slim_enrichment and full_slim_enrichment.
                 Defaults to "Cell Ontology"
         """
+        # TODO: Do we need to keep the whole anndata object, or is keeping only obs enough?
         self._anndata = AnndataLoader.load_from_file(file_path)
         self.__seed_list = self._anndata.obs[cell_type_field].unique().tolist()
         self.__enricher = Query(self.__seed_list)

diff --git a/pandasaurus_cxg/schema/__init__.py b/pandasaurus_cxg/schema/__init__.py
diff --git a/pandasaurus_cxg/schema/schema.json b/pandasaurus_cxg/schema/schema.json
@@ -0,0 +1,61 @@
+{
+    "nCount_RNA": false,
+    "nFeature_RNA": false,
+    "library": false,
+    "percent.er": false,
+    "percent.mt": false,
+    "degen.score": false,
+    "aEpi.score": false,
+    "aStr.score": false,
+    "cyc.score": false,
+    "matrisome.score": false,
+    "collagen.score": false,
+    "glycoprotein.score": false,
+    "proteoglycan.score": false,
+    "S.Score": false,
+    "G2M.Score": false,
+    "experiment": false,
+    "specimen": false,
+    "condition.long": false,
+    "condition.l1": false,
+    "condition.l2": false,
+    "donor_id": false,
+    "region.l1": false,
+    "region.l2": false,
+    "percent.cortex": false,
+    "percent.medulla": false,
+    "tissue_type": false,
+    "id": false,
+    "pagoda_k100_infomap_coembed": false,
+    "subclass.full": true,
+    "subclass.l3": true,
+    "subclass.l2": true,
+    "subclass.l1": true,
+    "state.l2": true,
+    "state": true,
+    "class": true,
+    "structure": false,
+    "disease_ontology_term_id": false,
+    "sex_ontology_term_id": false,
+    "development_stage_ontology_term_id": false,
+    "self_reported_ethnicity_ontology_term_id": false,
+    "eGFR": false,
+    "BMI": false,
+    "diabetes_history": false,
+    "hypertension": false,
+    "tissue_ontology_term_id": false,
+    "organism_ontology_term_id": false,
+    "assay_ontology_term_id": false,
+    "cell_type_ontology_term_id": false,
+    "is_primary_data": false,
+    "suspension_type": false,
+    "cell_type": false,
+    "assay": false,
+    "disease": false,
+    "organism": false,
+    "sex": false,
+    "tissue": false,
+    "self_reported_ethnicity": false,
+    "development_stage": false,
+    "author_cell_type": true
+}
diff --git a/pandasaurus_cxg/schema/schema_loader.py b/pandasaurus_cxg/schema/schema_loader.py
@@ -0,0 +1,7 @@
+import json
+
+
+def read_json_file(file_path):
+    """Parse the JSON document stored at ``file_path``.
+
+    Args:
+        file_path: Path of the JSON file to read.
+
+    Returns:
+        The decoded JSON content (a ``dict`` for the schema file shipped in this package).
+
+    Raises:
+        OSError: If the file cannot be opened.
+        json.JSONDecodeError: If the content is not valid JSON.
+    """
+    # Decode as UTF-8 explicitly instead of relying on the platform's locale encoding.
+    with open(file_path, encoding="utf-8") as file:
+        return json.load(file)