From e4ac754b8dd078490c8711f20be249e120b977c1 Mon Sep 17 00:00:00 2001
From: max
Date: Sat, 9 Mar 2024 09:18:50 +0100
Subject: [PATCH] rename parsers to fetchers, add NCBI taxonomy fetcher

Renames DataParser to AbstractFetcher and moves it from pyeed/parsers/ to
pyeed/fetchers/, adds an NCBITaxonomyParser that fetches NCBI taxonomy
records via Entrez and maps them onto the Organism model, and deletes the
scratch notebook used during development.

---
 .../abstractfetcher.py}                       | 130 +++++++++---------
 pyeed/fetchers/ncbitaxonomy.py                |  98 +++++++++++++
 pyeed/parsers/__init__.py                     |   0
 pyeed/parsers/parser_test.ipynb               |  97 -------------
 4 files changed, 163 insertions(+), 162 deletions(-)
 rename pyeed/{parsers/abstractparser.py => fetchers/abstractfetcher.py} (76%)
 create mode 100644 pyeed/fetchers/ncbitaxonomy.py
 delete mode 100644 pyeed/parsers/__init__.py
 delete mode 100644 pyeed/parsers/parser_test.ipynb

diff --git a/pyeed/parsers/abstractparser.py b/pyeed/fetchers/abstractfetcher.py
similarity index 76%
rename from pyeed/parsers/abstractparser.py
rename to pyeed/fetchers/abstractfetcher.py
index 21e82c3f..8cc0c606 100644
--- a/pyeed/parsers/abstractparser.py
+++ b/pyeed/fetchers/abstractfetcher.py
@@ -1,6 +1,6 @@
-import os
 import re
 import logging
+import secrets
 import logging.config
 from pathlib import Path
 from abc import ABC, abstractmethod
@@ -19,33 +19,50 @@
 
 path_config = Path(__file__).parent.parent.parent / "logging.conf"
 logging.config.fileConfig(path_config)
 
-logger = logging.getLogger("pyeed")
+LOGGER = logging.getLogger("pyeed")
 
 
-class DataParser(ABC):
-    def __init__(self, source: Any):
-        self.source = source
+class AbstractFetcher(ABC):
+    def __init__(self, foreign_id: str):
+        super().__init__()
+        self.foreign_id = foreign_id
 
     @abstractmethod
-    def fetch_entry(self):
+    def get(self):
         pass
 
     @abstractmethod
-    def parse_organism(self):
+    def map(self, handle: Any, cls):
         pass
 
-    @abstractmethod
-    def map(self):
-        pass
+    @staticmethod
+    def get_substitute_email() -> str:
+        # Random placeholder address for Entrez when the caller provides none
+        return f"{secrets.token_hex(8)}@gmail.com"
 
+    @staticmethod
+    def make_chunks(input_list: list, chunk_size: int = 100) -> List[list]:
+        """
+        Splits a list into chunks of a given size.
+        """
+        if input_list is None:
+            raise ValueError("input_list cannot be None.")
 
-class NCBIParser(DataParser):
+        if not isinstance(input_list, list):
+            raise TypeError("input_list must be a list")
+
+        return [
+            input_list[i : i + chunk_size]
+            for i in range(0, len(input_list), chunk_size)
+        ]
+
+
+class NCBIProteinParser(AbstractFetcher):
+
+    def __init__(self, source: Any):
+        # Keep the already fetched record available to the map_* methods
+        super().__init__(foreign_id=source.id)
+        self.source = source
+
+    def get(self):
+        # Fetching is still handled by the pyeed.ncbi.seq_io helpers
+        pass
 
     def map(self, cls: "ProteinInfo"):
         protein_info = cls(source_id=self.source.id, sequence=str(self.source.seq))
 
-        protein_info.organism = self.parse_organism()
+        protein_info.organism = self.map_organism()
         protein_info = self.map_protein(protein_info)
         protein_info = self.map_regions(protein_info)
         protein_info = self.map_sites(protein_info)
@@ -53,7 +70,7 @@ def map(self, cls: "ProteinInfo"):
 
         return protein_info
 
-    def parse_organism(self) -> Organism:
+    def map_organism(self) -> Organism:
         """
         Gets the organism name and taxonomy ID from the source data.
         Maps it to an Organism object.
@@ -61,41 +78,49 @@
         feature = self.get_feature("source")
 
         if len(feature) != 1:
-            logger.debug(
+            LOGGER.debug(
                 f"Multiple features ({len(feature)}) of type `source` found for {self.source.id}: {feature}"
             )
         feature = feature[0]
 
         try:
-            taxonomy_id = feature.qualifiers["db_xref"]
+            if len(feature.qualifiers["db_xref"]) != 1:
+                LOGGER.info(
+                    f"Expected one taxonomy ID, found {feature.qualifiers['db_xref']} for {self.source.id}, skipping organism assignment"
+                )
+                return None
+
+            taxonomy_id = feature.qualifiers["db_xref"][0]
+
+            if ":" in taxonomy_id:
+                taxonomy_id = int(taxonomy_id.split(":")[1])
+
         except KeyError:
-            logger.debug(
-                f"No taxonomy ID found for {self.source.id}: {feature[0].qualifiers}"
-            )
-            taxonomy_id = None
+            LOGGER.debug(f"No taxonomy ID found for {self.source.id}: {feature}")
+            return None
 
         try:
             organism_name = feature.qualifiers["organism"]
         except KeyError:
-            logger.debug(
+            LOGGER.debug(
                 f"No organism name found for {self.source.id}: {feature.qualifiers}"
             )
             organism_name = None
 
-        return Organism(name=organism_name[0], taxonomy_id=taxonomy_id[0])
+        return Organism(
+            name=organism_name[0] if organism_name else None,
+            taxonomy_id=taxonomy_id,
+        )
 
     def map_protein(self, protein_info: ProteinInfo):
         protein = self.get_feature("Protein")
         if len(protein) == 0:
-            logger.debug(
+            LOGGER.debug(
                 f"No protein feature found for {self.source.id}: {self.source.features}"
             )
             return protein_info
 
         if len(protein) > 1:
-            logger.debug(
+            LOGGER.debug(
                 f"Multiple features ({len(protein)}) of type `Protein` found for {self.source.id}"
             )
 
@@ -103,13 +128,13 @@ def map_protein(self, protein_info: ProteinInfo):
         try:
             protein_info.name = protein.qualifiers["product"][0]
         except KeyError:
-            logger.debug(
+            LOGGER.debug(
                 f"No protein name found for {self.source.id}: {protein.qualifiers}"
             )
             try:
                 protein_info.name = protein.qualifiers["name"][0]
             except KeyError:
-                logger.debug(
+                LOGGER.debug(
                     f"No protein name found for {self.source.id}: {protein.qualifiers}"
                 )
                 protein_info.name = None
@@ -117,7 +142,7 @@ def map_protein(self, protein_info: ProteinInfo):
         try:
             protein_info.mol_weight = protein.qualifiers["calculated_mol_wt"][0]
         except KeyError:
-            logger.debug(
+            LOGGER.debug(
                 f"No molecular weight found for {self.source.id}: {protein.qualifiers}"
             )
             protein_info.mol_weight = None
@@ -125,7 +150,7 @@ def map_protein(self, protein_info: ProteinInfo):
         try:
             protein_info.ec_number = protein.qualifiers["EC_number"][0]
         except KeyError:
-            logger.debug(
+            LOGGER.debug(
                 f"No EC number found for {self.source.id}: {protein.qualifiers}"
             )
             protein_info.ec_number = None
@@ -151,7 +176,7 @@ def map_regions(self, protein_info: ProteinInfo):
                 )
             )
         except KeyError:
-            logger.debug(
+            LOGGER.debug(
                 f"Incomplete region data found for {self.source.id}: {region.qualifiers}, skipping region"
             )
@@ -171,7 +196,7 @@ def map_sites(self, protein_info: ProteinInfo):
                     cross_ref=site.qualifiers["db_xref"][0],
                 )
             except KeyError:
-                logger.warning(
+                LOGGER.warning(
                     f"Incomplete site data found for {self.source.id}: {site.qualifiers}, skipping site"
                 )
@@ -181,14 +206,14 @@ def map_cds(self, protein_info: ProteinInfo):
         cds = self.get_feature("CDS")
 
         if len(cds) > 1:
-            logger.info(
+            LOGGER.info(
                 f"Multiple features ({len(cds)}) of type `CDS` found for {self.source.id}"
            )
 
         try:
             cds = cds[0]
         except IndexError:
-            logger.debug(f"No CDS found for {self.source.id}: {cds}")
+            LOGGER.debug(f"No CDS found for {self.source.id}: {cds}")
 
             return protein_info
 
@@ -197,7 +222,7 @@ def map_cds(self, protein_info: ProteinInfo):
                 cds.qualifiers["coded_by"][0]
             )
         except IndexError:
-            logger.debug(
+            LOGGER.debug(
                 f"No coding sequence reference found for {self.source.id}: {cds.qualifiers}"
             )
 
@@ -220,7 +245,7 @@ def get_cds_regions(coded_by: dict) -> List[DNARegion]:
         if not all(
             [reference_id == reference_ids[0] for reference_id in reference_ids]
         ):
-            logger.warning(
-                "Nucleotide sequence references are not identical: {reference_ids}"
+            LOGGER.warning(
+                f"Nucleotide sequence references are not identical: {reference_ids}"
             )
@@ -238,50 +263,27 @@ def get_cds_regions(coded_by: dict) -> List[DNARegion]:
 
         return region
 
-    def fetch_entry(self, identifier: str):
-        # Implementation for fetching data from NCBI
-        logger.debug(f"Fetching NCBI data for {identifier}")
-
-    def parse_data(self, data):
-        # Implementation for parsing NCBI data
-        logger.debug("Parsing NCBI data")
-
-    def get_feature(self, feature_type: str) -> "Bio.SeqFeature.SeqFeature":
+    def get_feature(self, feature_type: str) -> List["Bio.SeqFeature.SeqFeature"]:
         return [
             feature
             for feature in self.source.features
             if feature.type.lower() == feature_type.lower()
         ]
 
 
-class UniProtParser(DataParser):
-
-    def parse_organism():
-        pass
-
-    def fetch_entry(self, identifier: str):
-        # Implementation for fetching data from UniProt
-        pass
+class UniProtParser(AbstractFetcher):
 
-    def parse_data(self, data):
-        # Implementation for parsing UniProt data
+    def get(self):
+        pass
+
+    def map(self, handle: Any, cls):
         pass
 
 
-class ParserFactory:
-    @staticmethod
-    def get_parser(source: str) -> DataParser:
-        parsers = {"NCBI": NCBIParser(), "UniProt": UniProtParser()}
-        parser = parsers.get(source.upper())
-        if not parser:
-            raise ValueError(f"Parser for {source} not found.")
-        return parser
-
-
 if __name__ == "__main__":
-    from pyeed.ncbi.seq_io import get_ncbi_entry
+    from pyeed.core import Organism
+    from pyeed.fetchers.ncbitaxonomy import NCBITaxonomyParser
 
-    entry = get_ncbi_entry("7P82_A", "protein")
+    fetcher = NCBITaxonomyParser("311400")
+    entry = fetcher.get()
 
-    parser = NCBIParser(entry)
-    print(parser.map(ProteinInfo))
+    print(fetcher.map(entry[0], Organism))
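
The base class boils every fetcher down to two operations: get() retrieves raw
records for `foreign_id`, and map() turns one record into a pyeed model, while
make_chunks() batches large ID lists into request-sized groups. A minimal
sketch of the contract (illustrative only; DummyFetcher and its inline record
are not part of this patch):

    from typing import Any

    from pyeed.fetchers.abstractfetcher import AbstractFetcher


    class DummyFetcher(AbstractFetcher):
        def get(self):
            # Pretend to fetch one raw record for self.foreign_id
            return [{"id": self.foreign_id, "sequence": "MSEQNS"}]

        def map(self, handle: Any, cls):
            # Convert one raw record into the target data model
            return cls(source_id=handle["id"], sequence=handle["sequence"])


    # make_chunks splits long ID lists into batches of chunk_size (default 100)
    chunks = AbstractFetcher.make_chunks([str(i) for i in range(250)])
    assert [len(chunk) for chunk in chunks] == [100, 100, 50]
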
"class": + organism.tax_class = tax_rank.get("ScientificName") + elif tax_rank.get("Rank") == "order": + organism.order = tax_rank.get("ScientificName") + elif tax_rank.get("Rank") == "family": + organism.family = tax_rank.get("ScientificName") + elif tax_rank.get("Rank") == "genus": + organism.genus = tax_rank.get("ScientificName") + elif tax_rank.get("Rank") == "species": + organism.species = tax_rank.get("ScientificName") + elif tax_rank.get("Rank") == "kingdom": + organism.kingdom = tax_rank.get("ScientificName") + else: + continue + + return organism + + +if __name__ == "__main__": + single_tax_id = "9606" + multiple_tax_ids = ["9606"] + + # print(NCBITaxonomyParser(single_tax_id).get()) + + print(NCBITaxonomyParser(multiple_tax_ids).get()) diff --git a/pyeed/parsers/__init__.py b/pyeed/parsers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pyeed/parsers/parser_test.ipynb b/pyeed/parsers/parser_test.ipynb deleted file mode 100644 index 24ef0af7..00000000 --- a/pyeed/parsers/parser_test.ipynb +++ /dev/null @@ -1,97 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "%reload_ext autoreload\n", - "%autoreload 2\n", - "from abstractparser import NCBIParser" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-03-06 16:27:50,249 — abstractparser — WARNING — fetch_entry:36 — Warning\n", - "2024-03-06 16:27:50,249 — abstractparser — WARNING — fetch_entry:36 — Warning\n", - "WARNING:abstractparser:Warning\n" - ] - } - ], - "source": [ - "NCBIParser().fetch_entry(\"some_identifier\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-03-06 16:27:50,268 — abstractparser — WARNING — fetch_entry:36 — Warning\n", - "2024-03-06 16:27:50,268 — abstractparser — WARNING — fetch_entry:36 — Warning\n", - "WARNING:abstractparser:Warning\n" - ] - } - ], - "source": [ - "NCBIParser().fetch_entry(\"some_identifier\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "'NoneType' object is not subscriptable", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[12], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m omg \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(a\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, b\u001b[38;5;241m=\u001b[39m[\u001b[38;5;241m2\u001b[39m], c\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m \u001b[43momg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mx\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n", - "\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not subscriptable" - ] - } - ], - "source": [ - "omg = dict(a=1, b=[2], c=3)\n", - "\n", - "omg.get(\"x\")[0]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pye", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 
diff --git a/pyeed/parsers/parser_test.ipynb b/pyeed/parsers/parser_test.ipynb
deleted file mode 100644
index 24ef0af7..00000000
--- a/pyeed/parsers/parser_test.ipynb
+++ /dev/null
@@ -1,97 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%reload_ext autoreload\n",
-    "%autoreload 2\n",
-    "from abstractparser import NCBIParser"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-03-06 16:27:50,249 — abstractparser — WARNING — fetch_entry:36 — Warning\n",
-      "2024-03-06 16:27:50,249 — abstractparser — WARNING — fetch_entry:36 — Warning\n",
-      "WARNING:abstractparser:Warning\n"
-     ]
-    }
-   ],
-   "source": [
-    "NCBIParser().fetch_entry(\"some_identifier\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-03-06 16:27:50,268 — abstractparser — WARNING — fetch_entry:36 — Warning\n",
-      "2024-03-06 16:27:50,268 — abstractparser — WARNING — fetch_entry:36 — Warning\n",
-      "WARNING:abstractparser:Warning\n"
-     ]
-    }
-   ],
-   "source": [
-    "NCBIParser().fetch_entry(\"some_identifier\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "TypeError",
-     "evalue": "'NoneType' object is not subscriptable",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[12], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m omg \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(a\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, b\u001b[38;5;241m=\u001b[39m[\u001b[38;5;241m2\u001b[39m], c\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m \u001b[43momg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mx\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n",
-      "\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not subscriptable"
-     ]
-    }
-   ],
-   "source": [
-    "omg = dict(a=1, b=[2], c=3)\n",
-    "\n",
-    "omg.get(\"x\")[0]"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "pye",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.13"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
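
Taken together, the new fetcher is used roughly like this (a sketch; it
assumes the pyeed.core Organism model referenced above and a real contact
email for NCBI Entrez):

    from pyeed.core import Organism
    from pyeed.fetchers.ncbitaxonomy import NCBITaxonomyParser

    # Fetch taxonomy records for a batch of NCBI taxonomy IDs...
    fetcher = NCBITaxonomyParser(["9606", "311400"], email="you@example.org")
    records = fetcher.get()

    # ...and map each raw Entrez record onto the Organism model.
    organisms = [fetcher.map(record, Organism) for record in records]
    print(organisms[0].name)  # e.g. "Homo sapiens" for tax ID 9606
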