Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

fixed alignment and blast search #80

Merged
merged 2 commits into from
May 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyeed/core/abstractannotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class AbstractAnnotation(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/alignmentresult.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class AlignmentResult(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/blastdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class BlastData(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/clustalomegaresult.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class ClustalOmegaResult(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/dnarecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class DNARecord(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/organism.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class Organism(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/pairwisealignmentresult.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class PairwiseAlignmentResult(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
20 changes: 10 additions & 10 deletions pyeed/core/proteinrecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
from lxml.etree import _Element
from pydantic import PrivateAttr, model_validator
from pydantic_xml import attr, element
from rich.status import Console, Status
from rich.console import Console
from rich.status import Status
from sdRDM.base.listplus import ListPlus
from sdRDM.tools.utils import elem2dict

Expand Down Expand Up @@ -74,7 +75,7 @@ class ProteinRecord(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down Expand Up @@ -144,7 +145,6 @@ def get_id(cls, protein_id: str) -> "ProteinRecord":

import nest_asyncio

from pyeed.fetch.proteinfetcher import ProteinFetcher

nest_asyncio.apply()

Expand All @@ -166,7 +166,6 @@ def get_ids(cls, accession_ids: List[str]) -> List["ProteinRecord"]:

import nest_asyncio

from pyeed.fetch.proteinfetcher import ProteinFetcher

nest_asyncio.apply()

Expand Down Expand Up @@ -203,7 +202,6 @@ def from_sequence(
"""

from pyeed.fetch.blast import BlastProgram
from pyeed.fetch.proteinfetcher import ProteinFetcher

nest_asyncio.apply()

Expand Down Expand Up @@ -244,7 +242,7 @@ def ncbi_blast(
self,
n_hits: int,
e_value: float = 10.0,
database: str = "nr",
db: str = "swissprot",
matrix: str = "BLOSUM62",
identity: float = 0.0,
**kwargs,
Expand All @@ -255,7 +253,7 @@ def ncbi_blast(
Args:
n_hits (int): The number of hits to retrieve.
e_value (float, optional): The maximum E-value threshold for reporting hits. Defaults to 10.0.
database (str, optional): The database to search against. Defaults to "nr".
db (str, optional): The database to search against. Defaults to "swissprot".
matrix (str, optional): The substitution matrix to use. Defaults to "BLOSUM62".
identity (float, optional): The minimum sequence identity threshold for reporting hits. Defaults to 0.0.
**kwargs: Additional keyword arguments.
Expand All @@ -277,7 +275,9 @@ def ncbi_blast(

nest_asyncio.apply()

assert database in NCBIDataBase
assert (
db in NCBIDataBase
), f"Database needs to be one of {NCBIDataBase.__members__.keys()}"

program = BlastProgram.BLASTP.value
executor = ThreadPoolExecutor(max_workers=1)
Expand All @@ -292,12 +292,12 @@ def ncbi_blast(
with Status(
"Running BLAST", console=Console(force_terminal=False, force_jupyter=True)
):
result = asyncio.run(blaster.async_run(database, program, executor))
result = asyncio.run(blaster.async_run(db, program, executor))
clear_output()

accessions = blaster.extract_accession(result)

return asyncio.run(ProteinFetcher(ids=accessions).fetch(force_terminal=False))
return self.get_ids(accessions)

# def blastp(
# self,
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/region.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class Region(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/regionset.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class RegionSet(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_object_terms: Set[str] = PrivateAttr(
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class Sequence(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/sequencerecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class SequenceRecord(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/site.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class Site(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
2 changes: 1 addition & 1 deletion pyeed/core/standardnumbering.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class StandardNumbering(

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="ff1fb2064e9efbdf71ccde1d8f08b9af434150bb"
default="63f43b11e0d359e1d0a1f541cea25dd484ad0072"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)
Expand Down
8 changes: 3 additions & 5 deletions pyeed/fetch/blast.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
from concurrent.futures import ThreadPoolExecutor
from enum import Enum, EnumMeta
from typing import List
from typing import List, Optional

from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Blast.Record import Blast as BlastRecord
Expand Down Expand Up @@ -96,11 +96,9 @@ def run(self, program: str, ncbi_db: str) -> io.StringIO:
async def async_run(
self,
ncbi_db: str,
program: str = BlastProgram.BLASTP.value,
foreign_executor: ThreadPoolExecutor = None,
program: str,
foreign_executor: Optional[ThreadPoolExecutor] = None,
) -> io.StringIO:
assert program in BlastProgram
assert ncbi_db in NCBIDataBase

if not foreign_executor:
executor = ThreadPoolExecutor()
Expand Down
34 changes: 23 additions & 11 deletions pyeed/fetch/ncbiproteinmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
import re
from typing import TYPE_CHECKING, List

from Bio import SeqFeature, SeqIO
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature
from Bio.SeqRecord import SeqRecord
from pydantic import ValidationError

from pyeed.core.annotation import Annotation
from pyeed.core.dnarecord import DNARecord
Expand Down Expand Up @@ -44,9 +46,16 @@ def map(self, responses: List[str]) -> List[ProteinRecord]:

protein_infos = []
for record in seq_records:

protein_info = ProteinRecord(id=record.id, sequence=str(record.seq))

protein_info.organism = Organism(**self.map_organism(record))
try:
protein_info.organism = Organism(**self.map_organism(record))
except ValidationError as e:
LOGGER.error(
f"Error mapping organism for {record.id}: {e.errors()} {e.json()}"
)
continue

protein_info = self.map_protein(record, protein_info)

Expand All @@ -67,7 +76,7 @@ def map_organism(self, seq_record: SeqRecord) -> dict:
"""

feature = self.get_feature(seq_record, "source")
if len(feature) != 1:
if len(feature) < 1:
LOGGER.debug(
f"Multiple features ({len(feature)}) of type `source` found for {seq_record.id}: {feature}"
)
Expand All @@ -78,26 +87,29 @@ def map_organism(self, seq_record: SeqRecord) -> dict:
LOGGER.info(
f"For {seq_record.id} {feature.qualifiers['db_xref']} taxonomy ID(s) were found, using the first one. Skipping organism assignment"
)
return None
return {}

taxonomy_id = feature.qualifiers["db_xref"][0]
try:
taxonomy_id = next(feature for feature in feature.qualifiers["db_xref"] if "taxon" in feature)
if ":" in taxonomy_id:
taxonomy_id = taxonomy_id.split(":")[1]
except StopIteration:
taxonomy_id = None

if ":" in taxonomy_id:
taxonomy_id = int(taxonomy_id.split(":")[1])

except KeyError:
LOGGER.debug(f"No taxonomy ID found for {seq_record.id}: {feature}")
return None
return {}

try:
organism_name = feature.qualifiers["organism"]
except KeyError:
LOGGER.debug(
f"No organism name found for {seq_record.id}: {feature[0].qualifiers}"
)
organism_name = None
organism_name = ""

return {"name": organism_name[0], "taxonomy_id": taxonomy_id}
return {"id": taxonomy_id, "name": organism_name[0], "taxonomy_id": taxonomy_id}

def map_protein(self, seq_record: SeqRecord, protein_info: ProteinRecord):
"""Maps protein data from a `Bio.SeqRecord` to a `ProteinRecord` object."""
Expand Down Expand Up @@ -259,7 +271,7 @@ def get_cds_regions(coded_by: dict) -> List[DNARecord]:

return regions

def get_feature(self, seq_record: SeqRecord, feature_type: str) -> SeqFeature:
def get_feature(self, seq_record: SeqRecord, feature_type: str) -> List[SeqFeature]:
"""Returns a list of features of a given type from a `Bio.SeqRecord` object."""
return [
feature
Expand Down
2 changes: 1 addition & 1 deletion pyeed/fetch/requester.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ async def send_request(self, args: RequestArgs) -> str:
url = args.url

LOGGER.debug(f"Sending request to {url}")
response = await client.get(url, timeout=30)
response = await client.get(url, timeout=120)

LOGGER.debug(f"Received response from {url}. Code: {response.status_code}")

Expand Down