Skip to content

Commit

Permalink
Added filter function to identify UniProt sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
haeussma committed Apr 12, 2024
1 parent 5d46adc commit a4bd2bd
Showing 1 changed file with 21 additions and 1 deletion.
22 changes: 21 additions & 1 deletion pyeed/core/proteininfo.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import os
from typing import List, Optional
import warnings
Expand Down Expand Up @@ -232,15 +233,34 @@ def ncbi_blastp(
blast_record = NCBIXML.read(result_handle)

accessions = self._get_accessions(blast_record)
uniprot_accessions = self._filter_uniprot_accessions(accessions)
ncbi_accessions = list(set(accessions) - set(uniprot_accessions))

protein_infos = NCBIProteinFetcher(
foreign_id=accessions, api_key=api_key
foreign_id=ncbi_accessions, api_key=api_key
).fetch(ProteinInfo)
protein_infos.insert(0, self)

if uniprot_accessions:
from pyeed.fetchers.uniprotfetcher import UniprotFetcher

uniprot_proteins = UniprotFetcher(foreign_id=uniprot_accessions).fetch()
protein_infos.extend(uniprot_proteins)

print("🎉 Done\n")
return protein_infos

def _filter_uniprot_accessions(self, accessions: List[str]) -> List[str]:
uniprot_pattern = re.compile(
r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
)

return [
uniprot_pattern.match(acc)[0]
for acc in accessions
if uniprot_pattern.match(acc)
]

def blastp(
self,
db_path: str,
Expand Down

0 comments on commit a4bd2bd

Please sign in to comment.