Skip to content

Commit

Permalink
refactored pairwise aligner
Browse files Browse the repository at this point in the history
  • Loading branch information
haeussma committed May 10, 2024
1 parent db538e9 commit 04181da
Show file tree
Hide file tree
Showing 21 changed files with 1,084 additions and 292 deletions.
836 changes: 679 additions & 157 deletions examples/basics.ipynb

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyeed/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@

from .core.abstractannotation import AbstractAnnotation
from .core.alignmentdata import AlignmentData
from .core.alignmentresult import AlignmentResult
from .core.annotation import Annotation
from .core.blastdata import BlastData
from .core.clustalomegadata import ClustalOmegaData
from .core.clustalomegaresult import ClustalOmegaResult
from .core.cluster import Cluster
from .core.dnarecord import DNARecord
from .core.ontology import Ontology
from .core.organism import Organism
from .core.pairwisealignment import PairwiseAlignment
from .core.pairwisealignmentresult import PairwiseAlignmentResult
from .core.proteinrecord import ProteinRecord
from .core.region import Region
from .core.sequence import Sequence
Expand Down
182 changes: 123 additions & 59 deletions pyeed/align/pairwise_aligner.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
from itertools import combinations
from typing import Dict, List, TYPE_CHECKING
from typing import Dict, List

from tqdm import tqdm
from joblib import Parallel, delayed, cpu_count
from Bio.Align import Alignment as Alignment
from Bio.Align import PairwiseAligner as BioPairwiseAligner
from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix
from joblib import Parallel, cpu_count, delayed
from rich.progress import (
BarColumn,
MofNCompleteColumn,
Progress,
TextColumn,
TimeElapsedColumn,
TimeRemainingColumn,
)

from pyeed.core.pairwisealignmentresult import PairwiseAlignmentResult

if TYPE_CHECKING:
from Bio.Align import Alignment as Alignment
from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix

class PairwiseAligner:

def __init__(self, mode: str) -> None:
self.mode = mode
self.match = 1
Expand All @@ -20,98 +26,156 @@ def __init__(self, mode: str) -> None:
self.gap_extend = 0
self.substitution_matrix = "None"

def align_pairwise(self, seq1: str, seq2: str, seq1_id: str, seq2_id: str) -> dict:
"""
This function aligns two sequences and returns the alignment results.
def _align(
self,
seq1: Dict[str, str],
seq2: Dict[str, str],
) -> Alignment:
"""Aligns two sequences and returns the alignment results.
Args:
seq1 (str): The first sequence to be aligned.
seq2 (str): The second sequence to be aligned.
seq1 (Dict[str, str]): Sequence 1 to align. Key is the sequence ID.
seq2 (Dict[str, str]): Sequence 2 to align. Key is the sequence ID.
progress (Progress, optional): `Rich` progress. Defaults to None.
task_id (TaskID, optional): `Rich` task_id. Defaults to None.
Returns:
dict: The alignment results.
Alignment: The alignment results.
"""
aligner = BioPairwiseAligner()
aligner.mode = self.mode
aligner.match_score = self.match
aligner.mismatch_score = self.mismatch
aligner.open_gap_score = self.gap_open
aligner.extend_gap_score = self.gap_extend

if self.substitution_matrix != "None":
aligner.substitution_matrix = self._load_substitution_matrix()
aligner = self._get_aligner()

alignment_result = aligner.align(seq1, seq2)
results = aligner.align(list(seq1.values())[0], list(seq2.values())[0])

alignment_result_dic = self._map_alignment_results(alignment_result[0], seq1_id, seq2_id)
return results[0]

return alignment_result_dic
def align_pairwise(
self,
seq1: Dict[str, str],
seq2: Dict[str, str],
) -> PairwiseAlignmentResult:
"""Aligns two sequences and returns the alignment results.
def align_multipairwise(self, sequences: Dict[str, str], **kwargs) -> List[dict]:
"""
Args:
seq1 (Dict[str, str]): Sequence 1 to align. Key is the sequence ID.
seq2 (Dict[str, str]): Sequence 2 to align. Key is the sequence ID.
Returns:
PairwiseAlignmentResult: The alignment results.
"""

alignment_result = self._align(seq1, seq2)

return self._map_alignment_results(alignment_result, seq1, seq2)

def align_multipairwise(self, sequences: Dict[str, str], **kwargs) -> List[dict]:
""" """
# the sequences are stored in a dictionary, key = source.id value = sequence
# this creates a list of all possible pairs of sequences, on this list the alignment has to happen
pairs = list(combinations(sequences.keys(), 2))

progress_bar = Progress(
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
BarColumn(),
MofNCompleteColumn(),
TextColumn("•"),
TimeElapsedColumn(),
TextColumn("•"),
TimeRemainingColumn(),
)

alignments = []
with progress_bar as progress:
alignments = Parallel(n_jobs=cpu_count(), prefer="processes")(
delayed(self.align_pairwise)(
{pair[0]: sequences[pair[0]]}, {pair[1]: sequences[pair[1]]}
)
for pair in progress.track(pairs, description="⛓️Aligning sequences...")
)

return alignments

def _get_aligner(self) -> BioPairwiseAligner:
"""Creates a BioPython pairwise aligner object with the specified parameters
from the class instance."""
aligner = BioPairwiseAligner()
aligner.mode = self.mode
aligner.match_score = self.match
aligner.mismatch_score = self.mismatch
aligner.open_gap_score = self.gap_open
aligner.extend_gap_score = self.gap_extend

if self.substitution_matrix != "None":
aligner.substitution_matrix = self._load_substitution_matrix()

alignments = Parallel(n_jobs=cpu_count(), prefer="processes")(
delayed(self.align_pairwise)(sequences[a[0]], sequences[a[1]], a[0], a[1])
for a in tqdm(pairs, desc="⛓️ Running pairwise alignments")
)

return alignments
return aligner

def _map_alignment_results(self, alignment: "Alignment", seq1_id, seq2_id) -> List[dict]:
def _map_alignment_results(
self, alignment: Alignment, seq1: Dict[str, str], seq2: Dict[str, str]
) -> PairwiseAlignmentResult:
# this maps the alignment results to a dictionary
# this dictionary is used to create the network graph

shorter_seq = min(alignment[0], alignment[1], key=lambda x: len(x))

identities = alignment.counts().identities
identity = identities / len(shorter_seq)
gaps = alignment.counts().gaps
mismatches = alignment.counts().mismatches

sequences = [
{"id": list(seq1.keys())[0], "sequence": list(seq1.values())[0]},
{"id": list(seq2.keys())[0], "sequence": list(seq2.values())[0]},
]

aligned_sequences = [
{"id": list(seq1.keys())[0], "sequence": alignment[0]},
{"id": list(seq2.keys())[0], "sequence": alignment[1]},
]

alignment_dic = {
"seq1": alignment[0],
"seq2": alignment[1],
"seq1_id": seq1_id,
"seq2_id": seq2_id,
result_dict = {
"score": alignment.score,
"mismatches": 1 / (alignment.counts().mismatches + 1),
"gaps": 1 / (alignment.counts().gaps + 1),
"identity": identity,
"start": alignment.aligned[0],
"end": alignment.aligned[1],
"gaps": gaps,
"mismatches": mismatches,
"sequences": sequences,
"aligned_sequences": aligned_sequences,
}


return alignment_dic
# alignment_dic = {
# "seq1": alignment[0],
# "seq2": alignment[1],
# "seq1_id": seq1_id,
# "seq2_id": seq2_id,
# "score": alignment.score,
# "mismatches": 1 / (alignment.counts().mismatches + 1),
# "gaps": 1 / (alignment.counts().gaps + 1),
# "identity": identity,
# "start": alignment.aligned[0],
# "end": alignment.aligned[1],
# }

return result_dict

def _load_substitution_matrix(self) -> "BioSubstitutionMatrix":
from Bio.Align import substitution_matrices

return substitution_matrices.load(self.substitution_matrix)



if __name__ == "__main__":
aligner = PairwiseAligner(mode="global")
seq1 = "ATCG"
seq2 = "AGTC"
seq3 = "AAAA"
seq4 = "TTTT"
sequences = {
"seq1": seq1,
"seq2": seq2,
"seq3": seq3,
"seq4": seq4
}
alignment_result = aligner.align_multipairwise(sequences=sequences)

seq1 = dict(s1="ATCG")
seq2 = dict(s2="ATCC")
seq3 = dict(s3="GGCC")
seq4 = dict(s4="GGGC")

all_seq = {**seq1, **seq2, **seq3, **seq4}

print(aligner.align_pairwise(seq1=seq1, seq2=seq2))

alignment_result = aligner.align_multipairwise(sequences=all_seq)

print(alignment_result)

print(alignment_result)
print(alignment_result)
3 changes: 3 additions & 0 deletions pyeed/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from .abstractannotation import AbstractAnnotation
from .alignmentdata import AlignmentData
from .alignmentresult import AlignmentResult
from .annotation import Annotation
from .blastdata import BlastData
from .clustalomegadata import ClustalOmegaData
from .clustalomegaresult import ClustalOmegaResult
from .cluster import Cluster
from .dnarecord import DNARecord
from .ontology import Ontology
from .organism import Organism
from .pairwisealignment import PairwiseAlignment
from .pairwisealignmentresult import PairwiseAlignmentResult
from .proteinrecord import ProteinRecord
from .region import Region
from .sequence import Sequence
Expand Down
5 changes: 0 additions & 5 deletions pyeed/core/abstractannotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,6 @@ class AbstractAnnotation(
),
)

_repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
_commit: Optional[str] = PrivateAttr(
default="a7defc5c87a2296a2e4b522b07236b2aef6413ac"
)

_raw_xml_data: Dict = PrivateAttr(default_factory=dict)

@model_validator(mode="after")
Expand Down
Loading

0 comments on commit 04181da

Please sign in to comment.