From e4ac754b8dd078490c8711f20be249e120b977c1 Mon Sep 17 00:00:00 2001
From: max
Date: Sat, 9 Mar 2024 09:18:50 +0100
Subject: [PATCH] rename parsers to fetchers, add NCBI taxonomy fetcher

Renames DataParser to AbstractFetcher and moves it from pyeed/parsers/ to
pyeed/fetchers/, adds an NCBITaxonomyParser that fetches NCBI taxonomy
records via Entrez and maps them onto the Organism model, and deletes the
scratch notebook used during development.

---
 .../abstractfetcher.py}                       | 130 +++++++++---------
 pyeed/fetchers/ncbitaxonomy.py                |  98 +++++++++++++
 pyeed/parsers/__init__.py                     |   0
 pyeed/parsers/parser_test.ipynb               |  97 -------------
 4 files changed, 163 insertions(+), 162 deletions(-)
 rename pyeed/{parsers/abstractparser.py => fetchers/abstractfetcher.py} (76%)
 create mode 100644 pyeed/fetchers/ncbitaxonomy.py
 delete mode 100644 pyeed/parsers/__init__.py
 delete mode 100644 pyeed/parsers/parser_test.ipynb

diff --git a/pyeed/parsers/abstractparser.py b/pyeed/fetchers/abstractfetcher.py
similarity index 76%
rename from pyeed/parsers/abstractparser.py
rename to pyeed/fetchers/abstractfetcher.py
index 21e82c3f..8cc0c606 100644
--- a/pyeed/parsers/abstractparser.py
+++ b/pyeed/fetchers/abstractfetcher.py
@@ -1,6 +1,6 @@
-import os
 import re
 import logging
+import secrets
 import logging.config
 from pathlib import Path
 from abc import ABC, abstractmethod
@@ -19,33 +19,50 @@
 
 path_config = Path(__file__).parent.parent.parent / "logging.conf"
 logging.config.fileConfig(path_config)
 
-logger = logging.getLogger("pyeed")
+LOGGER = logging.getLogger("pyeed")
 
 
-class DataParser(ABC):
-    def __init__(self, source: Any):
-        self.source = source
+class AbstractFetcher(ABC):
+    def __init__(self, foreign_id: str):
+        super().__init__()
+        self.foreign_id = foreign_id
 
     @abstractmethod
-    def fetch_entry(self):
+    def get(self):
         pass
 
     @abstractmethod
-    def parse_organism(self):
+    def map(self, handle: Any, cls):
         pass
 
-    @abstractmethod
-    def map(self):
-        pass
+    @staticmethod
+    def get_substitute_email() -> str:
+        # Random placeholder address for Entrez when the caller provides none
+        return f"{secrets.token_hex(8)}@gmail.com"
 
+    @staticmethod
+    def make_chunks(input_list: list, chunk_size: int = 100) -> List[list]:
+        """
+        Splits a list into chunks of a given size.
+        """
+        if input_list is None:
+            raise ValueError("input_list cannot be None.")
 
-class NCBIParser(DataParser):
+        if not isinstance(input_list, list):
+            raise TypeError("input_list must be a list")
+
+        return [
+            input_list[i : i + chunk_size]
+            for i in range(0, len(input_list), chunk_size)
+        ]
+
+
+class NCBIProteinParser(AbstractFetcher):
+
+    def __init__(self, source: Any):
+        # Keep the already fetched record available to the map_* methods
+        super().__init__(foreign_id=source.id)
+        self.source = source
+
+    def get(self):
+        # Fetching is still handled by the pyeed.ncbi.seq_io helpers
+        pass
 
     def map(self, cls: "ProteinInfo"):
         protein_info = cls(source_id=self.source.id, sequence=str(self.source.seq))
 
-        protein_info.organism = self.parse_organism()
+        protein_info.organism = self.map_organism()
         protein_info = self.map_protein(protein_info)
         protein_info = self.map_regions(protein_info)
         protein_info = self.map_sites(protein_info)
@@ -53,7 +70,7 @@ def map(self, cls: "ProteinInfo"):
 
         return protein_info
 
-    def parse_organism(self) -> Organism:
+    def map_organism(self) -> Organism:
         """
         Gets the organism name and taxonomy ID from the source data.
         Maps it to an Organism object.
@@ -61,41 +78,49 @@
         feature = self.get_feature("source")
 
         if len(feature) != 1:
-            logger.debug(
+            LOGGER.debug(
                 f"Multiple features ({len(feature)}) of type `source` found for {self.source.id}: {feature}"
             )
         feature = feature[0]
 
         try:
-            taxonomy_id = feature.qualifiers["db_xref"]
+            if len(feature.qualifiers["db_xref"]) != 1:
+                LOGGER.info(
+                    f"Expected one taxonomy ID, found {feature.qualifiers['db_xref']} for {self.source.id}, skipping organism assignment"
+                )
+                return None
+
+            taxonomy_id = feature.qualifiers["db_xref"][0]
+
+            if ":" in taxonomy_id:
+                taxonomy_id = int(taxonomy_id.split(":")[1])
+
         except KeyError:
-            logger.debug(
-                f"No taxonomy ID found for {self.source.id}: {feature[0].qualifiers}"
-            )
-            taxonomy_id = None
+            LOGGER.debug(f"No taxonomy ID found for {self.source.id}: {feature}")
+            return None
 
         try:
             organism_name = feature.qualifiers["organism"]
         except KeyError:
-            logger.debug(
+            LOGGER.debug(
                 f"No organism name found for {self.source.id}: {feature.qualifiers}"
             )
             organism_name = None
 
-        return Organism(name=organism_name[0], taxonomy_id=taxonomy_id[0])
+        return Organism(
+            name=organism_name[0] if organism_name else None,
+            taxonomy_id=taxonomy_id,
+        )
 
     def map_protein(self, protein_info: ProteinInfo):
         protein = self.get_feature("Protein")
         if len(protein) == 0:
-            logger.debug(
+            LOGGER.debug(
                 f"No protein feature found for {self.source.id}: {self.source.features}"
             )
             return protein_info
 
         if len(protein) > 1:
-            logger.debug(
+            LOGGER.debug(
                 f"Multiple features ({len(protein)}) of type `Protein` found for {self.source.id}"
             )
 
@@ -103,13 +128,13 @@ def map_protein(self, protein_info: ProteinInfo):
         try:
             protein_info.name = protein.qualifiers["product"][0]
         except KeyError:
-            logger.debug(
+            LOGGER.debug(
                 f"No protein name found for {self.source.id}: {protein.qualifiers}"
             )
             try:
                 protein_info.name = protein.qualifiers["name"][0]
             except KeyError:
-                logger.debug(
+                LOGGER.debug(
                     f"No protein name found for {self.source.id}: {protein.qualifiers}"
                 )
                 protein_info.name = None
@@ -117,7 +142,7 @@ def map_protein(self, protein_info: ProteinInfo):
         try:
             protein_info.mol_weight = protein.qualifiers["calculated_mol_wt"][0]
         except KeyError:
-            logger.debug(
+            LOGGER.debug(
                 f"No molecular weight found for {self.source.id}: {protein.qualifiers}"
             )
             protein_info.mol_weight = None
@@ -125,7 +150,7 @@ def map_protein(self, protein_info: ProteinInfo):
         try:
             protein_info.ec_number = protein.qualifiers["EC_number"][0]
         except KeyError:
-            logger.debug(
+            LOGGER.debug(
                 f"No EC number found for {self.source.id}: {protein.qualifiers}"
             )
             protein_info.ec_number = None
@@ -151,7 +176,7 @@ def map_regions(self, protein_info: ProteinInfo):
                 )
             )
         except KeyError:
-            logger.debug(
+            LOGGER.debug(
                 f"Incomplete region data found for {self.source.id}: {region.qualifiers}, skipping region"
             )
@@ -171,7 +196,7 @@ def map_sites(self, protein_info: ProteinInfo):
                     cross_ref=site.qualifiers["db_xref"][0],
                 )
             except KeyError:
-                logger.warning(
+                LOGGER.warning(
                     f"Incomplete site data found for {self.source.id}: {site.qualifiers}, skipping site"
                 )
@@ -181,14 +206,14 @@ def map_cds(self, protein_info: ProteinInfo):
         cds = self.get_feature("CDS")
 
         if len(cds) > 1:
-            logger.info(
+            LOGGER.info(
                 f"Multiple features ({len(cds)}) of type `CDS` found for {self.source.id}"
            )
 
         try:
             cds = cds[0]
         except IndexError:
-            logger.debug(f"No CDS found for {self.source.id}: {cds}")
+            LOGGER.debug(f"No CDS found for {self.source.id}: {cds}")
 
             return protein_info
 
@@ -197,7 +222,7 @@ def map_cds(self, protein_info: ProteinInfo):
                 cds.qualifiers["coded_by"][0]
             )
         except IndexError:
-            logger.debug(
+            LOGGER.debug(
                 f"No coding sequence reference found for {self.source.id}: {cds.qualifiers}"
             )
 
@@ -220,7 +245,7 @@ def get_cds_regions(coded_by: dict) -> List[DNARegion]:
         if not all(
             [reference_id == reference_ids[0] for reference_id in reference_ids]
         ):
-            logger.warning(
-                "Nucleotide sequence references are not identical: {reference_ids}"
+            LOGGER.warning(
+                f"Nucleotide sequence references are not identical: {reference_ids}"
             )
@@ -238,50 +263,27 @@ def get_cds_regions(coded_by: dict) -> List[DNARegion]:
 
         return region
 
-    def fetch_entry(self, identifier: str):
-        # Implementation for fetching data from NCBI
-        logger.debug(f"Fetching NCBI data for {identifier}")
-
-    def parse_data(self, data):
-        # Implementation for parsing NCBI data
-        logger.debug("Parsing NCBI data")
-
-    def get_feature(self, feature_type: str) -> "Bio.SeqFeature.SeqFeature":
+    def get_feature(self, feature_type: str) -> List["Bio.SeqFeature.SeqFeature"]:
         return [
             feature
             for feature in self.source.features
             if feature.type.lower() == feature_type.lower()
         ]
 
 
-class UniProtParser(DataParser):
-
-    def parse_organism():
-        pass
-
-    def fetch_entry(self, identifier: str):
-        # Implementation for fetching data from UniProt
-        pass
+class UniProtParser(AbstractFetcher):
 
-    def parse_data(self, data):
-        # Implementation for parsing UniProt data
+    def get(self):
+        pass
+
+    def map(self, handle: Any, cls):
         pass
 
 
-class ParserFactory:
-    @staticmethod
-    def get_parser(source: str) -> DataParser:
-        parsers = {"NCBI": NCBIParser(), "UniProt": UniProtParser()}
-        parser = parsers.get(source.upper())
-        if not parser:
-            raise ValueError(f"Parser for {source} not found.")
-        return parser
-
-
 if __name__ == "__main__":
-    from pyeed.ncbi.seq_io import get_ncbi_entry
+    from pyeed.core import Organism
+    from pyeed.fetchers.ncbitaxonomy import NCBITaxonomyParser
 
-    entry = get_ncbi_entry("7P82_A", "protein")
+    fetcher = NCBITaxonomyParser("311400")
+    entry = fetcher.get()
 
-    parser = NCBIParser(entry)
-    print(parser.map(ProteinInfo))
+    print(fetcher.map(entry[0], Organism))
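
The base class boils every fetcher down to two operations: get() retrieves raw
records for `foreign_id`, and map() turns one record into a pyeed model, while
make_chunks() batches large ID lists into request-sized groups. A minimal
sketch of the contract (illustrative only; DummyFetcher and its inline record
are not part of this patch):

    from typing import Any

    from pyeed.fetchers.abstractfetcher import AbstractFetcher


    class DummyFetcher(AbstractFetcher):
        def get(self):
            # Pretend to fetch one raw record for self.foreign_id
            return [{"id": self.foreign_id, "sequence": "MSEQNS"}]

        def map(self, handle: Any, cls):
            # Convert one raw record into the target data model
            return cls(source_id=handle["id"], sequence=handle["sequence"])


    # make_chunks splits long ID lists into batches of chunk_size (default 100)
    chunks = AbstractFetcher.make_chunks([str(i) for i in range(250)])
    assert [len(chunk) for chunk in chunks] == [100, 100, 50]
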
"class": + organism.tax_class = tax_rank.get("ScientificName") + elif tax_rank.get("Rank") == "order": + organism.order = tax_rank.get("ScientificName") + elif tax_rank.get("Rank") == "family": + organism.family = tax_rank.get("ScientificName") + elif tax_rank.get("Rank") == "genus": + organism.genus = tax_rank.get("ScientificName") + elif tax_rank.get("Rank") == "species": + organism.species = tax_rank.get("ScientificName") + elif tax_rank.get("Rank") == "kingdom": + organism.kingdom = tax_rank.get("ScientificName") + else: + continue + + return organism + + +if __name__ == "__main__": + single_tax_id = "9606" + multiple_tax_ids = ["9606"] + + # print(NCBITaxonomyParser(single_tax_id).get()) + + print(NCBITaxonomyParser(multiple_tax_ids).get()) diff --git a/pyeed/parsers/__init__.py b/pyeed/parsers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pyeed/parsers/parser_test.ipynb b/pyeed/parsers/parser_test.ipynb deleted file mode 100644 index 24ef0af7..00000000 --- a/pyeed/parsers/parser_test.ipynb +++ /dev/null @@ -1,97 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "%reload_ext autoreload\n", - "%autoreload 2\n", - "from abstractparser import NCBIParser" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-03-06 16:27:50,249 — abstractparser — WARNING — fetch_entry:36 — Warning\n", - "2024-03-06 16:27:50,249 — abstractparser — WARNING — fetch_entry:36 — Warning\n", - "WARNING:abstractparser:Warning\n" - ] - } - ], - "source": [ - "NCBIParser().fetch_entry(\"some_identifier\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-03-06 16:27:50,268 — abstractparser — WARNING — fetch_entry:36 — Warning\n", - "2024-03-06 16:27:50,268 — abstractparser — WARNING — fetch_entry:36 — Warning\n", - "WARNING:abstractparser:Warning\n" - ] - } - ], - "source": [ - "NCBIParser().fetch_entry(\"some_identifier\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "'NoneType' object is not subscriptable", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[12], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m omg \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(a\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, b\u001b[38;5;241m=\u001b[39m[\u001b[38;5;241m2\u001b[39m], c\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m \u001b[43momg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mx\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n", - "\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not subscriptable" - ] - } - ], - "source": [ - "omg = dict(a=1, b=[2], c=3)\n", - "\n", - "omg.get(\"x\")[0]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pye", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 
diff --git a/pyeed/parsers/parser_test.ipynb b/pyeed/parsers/parser_test.ipynb
deleted file mode 100644
index 24ef0af7..00000000
--- a/pyeed/parsers/parser_test.ipynb
+++ /dev/null
@@ -1,97 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%reload_ext autoreload\n",
-    "%autoreload 2\n",
-    "from abstractparser import NCBIParser"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-03-06 16:27:50,249 — abstractparser — WARNING — fetch_entry:36 — Warning\n",
-      "2024-03-06 16:27:50,249 — abstractparser — WARNING — fetch_entry:36 — Warning\n",
-      "WARNING:abstractparser:Warning\n"
-     ]
-    }
-   ],
-   "source": [
-    "NCBIParser().fetch_entry(\"some_identifier\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-03-06 16:27:50,268 — abstractparser — WARNING — fetch_entry:36 — Warning\n",
-      "2024-03-06 16:27:50,268 — abstractparser — WARNING — fetch_entry:36 — Warning\n",
-      "WARNING:abstractparser:Warning\n"
-     ]
-    }
-   ],
-   "source": [
-    "NCBIParser().fetch_entry(\"some_identifier\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "TypeError",
-     "evalue": "'NoneType' object is not subscriptable",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[12], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m omg \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(a\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, b\u001b[38;5;241m=\u001b[39m[\u001b[38;5;241m2\u001b[39m], c\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m \u001b[43momg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mx\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n",
-      "\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not subscriptable"
-     ]
-    }
-   ],
-   "source": [
-    "omg = dict(a=1, b=[2], c=3)\n",
-    "\n",
-    "omg.get(\"x\")[0]"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "pye",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.13"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
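
Taken together, the new fetcher is used roughly like this (a sketch; it
assumes the pyeed.core Organism model referenced above and a real contact
email for NCBI Entrez):

    from pyeed.core import Organism
    from pyeed.fetchers.ncbitaxonomy import NCBITaxonomyParser

    # Fetch taxonomy records for a batch of NCBI taxonomy IDs...
    fetcher = NCBITaxonomyParser(["9606", "311400"], email="you@example.org")
    records = fetcher.get()

    # ...and map each raw Entrez record onto the Organism model.
    organisms = [fetcher.map(record, Organism) for record in records]
    print(organisms[0].name)  # e.g. "Homo sapiens" for tax ID 9606
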