INCATools · ubyndr · Jun 23, 2023 · Jun 16, 2023 · Jun 16, 2023 · Jun 16, 2023
diff --git a/.flake8 b/.flake8
@@ -1,3 +1,3 @@
 [flake8]
 ignore = F401, E203, W503
-max-line-length = 100
+max-line-length = 125
diff --git a/pandasaurus_cxg/anndata_analyzer.py b/pandasaurus_cxg/anndata_analyzer.py
@@ -0,0 +1,152 @@
+from enum import Enum
+import os
+from typing import List
+
+import pandas as pd
+
+from pandasaurus_cxg.anndata_loader import AnndataLoader
+from pandasaurus_cxg.schema.schema_loader import read_json_file
+
+
+# Debug tracing is on whenever DEBUG is set to any non-empty value (even "0" or "false").
+debug_mode = os.environ.get("DEBUG")
+
+class AnndataAnalyzer:
+    """
+    A class providing methods for different types of analysis on an AnnData object.
+
+    Args:
+        file_path (str): The path to the AnnData file.
+        schema_path (str): The path to the schema file.
+
+    Attributes:
+        _anndata_obs (pd.DataFrame): The observation data from the AnnData object.
+        _schema (dict): The schema data loaded from the schema file.
+
+    """
+
+    def __init__(self, file_path: str, schema_path: str):
+        """
+        Initializes an instance of the AnndataAnalyzer class.
+
+        Args:
+            file_path (str): The path to the AnnData file.
+            schema_path (str): The path to the schema file.
+
+        """
+        self._anndata_obs = AnndataLoader.load_from_file(file_path).obs
+        self._schema = read_json_file(schema_path)
+
+    def co_annotation_report(self):
+        """
+        Generates a co-annotation report based on the provided schema.
+
+        Examples:
+            | subclass.l3, dPT, cluster_matches, subclass.full, Degenerative Proximal Tubule Epithelial Cell
+            | subclass.l3, aTAL1, subcluster_of, subclass.full, Adaptive / Maladaptive / Repairing Thick Ascending Limb Cell
+            | class, epithelial cells, cluster_matches, cell_type, kidney collecting duct intercalated cell
+
+        Returns:
+            pd.DataFrame: The co-annotation report with the columns ``field_name1``,
+            ``value1``, ``predicate``, ``field_name2`` and ``value2``.
+
+        """
+        columns = self._anndata_obs.columns
+        fields = [key for key, value in self._schema.items() if value and key in columns]
+        rows = []
+        for field_name_2 in fields:
+            for field_name_1 in fields:
+                if field_name_1 == field_name_2:
+                    continue
+                pair = self._anndata_obs[[field_name_1, field_name_2]]
+                co_oc = pair.drop_duplicates().reset_index(drop=True)
+                if co_oc.empty:
+                    # Nothing to classify; a row-wise apply here would not yield a column.
+                    continue
+                # Map each value to the list of values it co-occurs with in the other field.
+                lookup_1 = co_oc.groupby(field_name_1)[field_name_2].apply(list).to_dict()
+                lookup_2 = co_oc.groupby(field_name_2)[field_name_1].apply(list).to_dict()
+                co_oc["predicate"] = co_oc.apply(
+                    self._assign_predicate,
+                    args=(field_name_1, field_name_2, lookup_1, lookup_2, debug_mode),
+                    axis=1,
+                )
+                # Read cells by name so the column order does not rely on dict ordering.
+                for rec in co_oc.to_dict(orient="records"):
+                    value_1, value_2 = rec[field_name_1], rec[field_name_2]
+                    rows.append([field_name_1, value_1, rec["predicate"], field_name_2, value_2])
+
+        report_columns = ["field_name1", "value1", "predicate", "field_name2", "value2"]
+        return pd.DataFrame(self._remove_duplicates(rows), columns=report_columns)
+
+    @staticmethod
+    def _remove_duplicates(data: List[List[str]]):
+        """
+        Drops ``supercluster_of`` rows and rows whose cells repeat an earlier row in any order.
+
+        Args:
+            data (List[List[str]]): Rows of hashable cells.
+
+        Returns:
+            List[List[str]]: The surviving rows in their original order.
+
+        """
+        unique_data = []
+        seen = set()
+        for sublist in data:
+            if Predicate.SUPERCLUSTER_OF.value in sublist:
+                continue
+            # frozenset keys ignore order without sorting, so mixed-type cells cannot raise.
+            key = frozenset(sublist)
+            if key not in seen:
+                seen.add(key)
+                unique_data.append(sublist)
+        return unique_data
+
+    @staticmethod
+    def _assign_predicate(
+        row, field_name_1, field_name_2, field_name_1_dict, field_name_2_dict, debug
+    ):
+        """
+        Classifies how the two values of one co-occurrence row relate to each other.
+
+        Args:
+            row: Row holding one value for ``field_name_1`` and one for ``field_name_2``.
+            field_name_1 (str): Name of the first field.
+            field_name_2 (str): Name of the second field.
+            field_name_1_dict (dict): Maps a ``field_name_1`` value to its ``field_name_2`` values.
+            field_name_2_dict (dict): Maps a ``field_name_2`` value to its ``field_name_1`` values.
+            debug: When truthy, the inputs of the decision are printed.
+
+        Returns:
+            str: The value of the matching ``Predicate`` member.
+
+        """
+        value_1, value_2 = row[field_name_1], row[field_name_2]
+        field_name_1_values = field_name_1_dict.get(value_1, [])
+        field_name_2_values = field_name_2_dict.get(value_2, [])
+        if debug:
+            print("Debugging row:", row)
+            print("Value of field_name_1:", value_1)
+            print("Value of field_name_1_dict:", field_name_1_values)
+            print("Value of field_name_2:", value_2)
+            print("Value of field_name_2_dict:", field_name_2_values)
+
+        # One-to-one: each value co-occurs with the other and with nothing else.
+        if field_name_2_values == [value_1] and field_name_1_values == [value_2]:
+            return Predicate.CLUSTER_MATCHES.value
+
+        if value_1 in field_name_2_values and value_2 in field_name_1_values:
+            if len(field_name_1_values) == 1:
+                return Predicate.SUBCLUSTER_OF.value
+            if len(field_name_2_values) == 1:
+                return Predicate.SUPERCLUSTER_OF.value
+
+        return Predicate.CLUSTER_OVERLAPS.value
+
+
+class Predicate(Enum):  # relation labels returned by AnndataAnalyzer._assign_predicate
+    CLUSTER_MATCHES = "cluster_matches"  # each value co-occurs with the other one only
+    CLUSTER_OVERLAPS = "cluster_overlaps"  # fallback when none of the other relations holds
+    SUBCLUSTER_OF = "subcluster_of"  # the first field's value co-occurs with a single second-field value
+    SUPERCLUSTER_OF = "supercluster_of"  # the second field's value co-occurs with a single first-field value
diff --git a/pandasaurus_cxg/anndata_enricher.py b/pandasaurus_cxg/anndata_enricher.py
@@ -31,6 +31,7 @@ def __init__(
                 The slim list is used in minimal_slim_enrichment and full_slim_enrichment.
                 Defaults to "Cell Ontology"
         """
+        # TODO Do we need to keep whole anndata? Would it be enough to keep the obs only?
         self._anndata = AnndataLoader.load_from_file(file_path)
         self.__seed_list = self._anndata.obs[cell_type_field].unique().tolist()
         self.__enricher = Query(self.__seed_list)

diff --git a/pandasaurus_cxg/schema/__init__.py b/pandasaurus_cxg/schema/__init__.py
diff --git a/pandasaurus_cxg/schema/schema.json b/pandasaurus_cxg/schema/schema.json
@@ -0,0 +1,61 @@
+{
+    "nCount_RNA": false,
+    "nFeature_RNA": false,
+    "library": false,
+    "percent.er": false,
+    "percent.mt": false,
+    "degen.score": false,
+    "aEpi.score": false,
+    "aStr.score": false,
+    "cyc.score": false,
+    "matrisome.score": false,
+    "collagen.score": false,
+    "glycoprotein.score": false,
+    "proteoglycan.score": false,
+    "S.Score": false,
+    "G2M.Score": false,
+    "experiment": false,
+    "specimen": false,
+    "condition.long": false,
+    "condition.l1": false,
+    "condition.l2": false,
+    "donor_id": false,
+    "region.l1": false,
+    "region.l2": false,
+    "percent.cortex": false,
+    "percent.medulla": false,
+    "tissue_type": false,
+    "id": false,
+    "pagoda_k100_infomap_coembed": false,
+    "subclass.full": true,
+    "subclass.l3": true,
+    "subclass.l2": true,
+    "subclass.l1": true,
+    "state.l2": true,
+    "state": true,
+    "class": true,
+    "structure": false,
+    "disease_ontology_term_id": false,
+    "sex_ontology_term_id": false,
+    "development_stage_ontology_term_id": false,
+    "self_reported_ethnicity_ontology_term_id": false,
+    "eGFR": false,
+    "BMI": false,
+    "diabetes_history": false,
+    "hypertension": false,
+    "tissue_ontology_term_id": false,
+    "organism_ontology_term_id": false,
+    "assay_ontology_term_id": false,
+    "cell_type_ontology_term_id": false,
+    "is_primary_data": false,
+    "suspension_type": false,
+    "cell_type": true,
+    "assay": false,
+    "disease": false,
+    "organism": false,
+    "sex": false,
+    "tissue": false,
+    "self_reported_ethnicity": false,
+    "development_stage": false,
+    "author_cell_type": true
+}
diff --git a/pandasaurus_cxg/schema/schema_loader.py b/pandasaurus_cxg/schema/schema_loader.py
@@ -0,0 +1,7 @@
+import json
+
+
+def read_json_file(file_path):
+    """Return the object decoded from the JSON file at ``file_path`` (read as UTF-8)."""
+    with open(file_path, "r", encoding="utf-8") as file:
+        return json.load(file)