# Recovered from the patch "added filter function to identify uniprot
# sequences" against pyeed/core/proteininfo.py.  In that patch the function is
# a method (it sits between ``ncbi_blastp`` and ``blastp``), which is why it
# takes ``self`` even though it never reads instance state.
import re
from typing import List

# UniProt accession grammar, compiled once at import time.  The trailing
# negative lookahead rejects identifiers where the accession-shaped text is
# only the prefix of a longer alphanumeric token (e.g. "A0A024R16"), while
# still allowing separators such as "." (version) or "_" (species suffix).
_UNIPROT_ACCESSION = re.compile(
    r"(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})"
    r"(?![A-Za-z0-9])"
)


def _filter_uniprot_accessions(self, accessions: List[str]) -> List[str]:
    """Return the UniProt accessions that the given identifiers start with.

    Each identifier is matched from its first character against the UniProt
    accession grammar.  Identifiers that do not start with an accession, or
    whose accession-shaped prefix runs straight into further letters or
    digits, are dropped.  Input order and duplicates are preserved.

    Args:
        accessions: Identifiers to classify.

    Returns:
        The matched accession text for every identifier that matched.  This
        is the accession only: a suffix such as ``.2`` or ``_HUMAN`` is not
        part of the returned string.

    NOTE(review): because suffixes are stripped, a caller that subtracts this
    result from the raw identifier list (as ``ncbi_blastp`` does in the patch)
    will not remove the suffixed originals -- confirm whether the identifiers
    handed in can carry a version suffix.
    """
    # One regex evaluation per identifier; ``None`` marks a non-match.
    matches = map(_UNIPROT_ACCESSION.match, accessions)
    return [found.group(0) for found in matches if found is not None]