refactored pairwise aligner

PyEED · May 10, 2024 · 04181da · 04181da
1 parent db538e9
commit 04181da
Show file tree

Hide file tree

Showing 21 changed files with 1,084 additions and 292 deletions.
diff --git a/examples/basics.ipynb b/examples/basics.ipynb
diff --git a/pyeed/__init__.py b/pyeed/__init__.py
@@ -2,14 +2,17 @@
 
 from .core.abstractannotation import AbstractAnnotation
 from .core.alignmentdata import AlignmentData
+from .core.alignmentresult import AlignmentResult
 from .core.annotation import Annotation
 from .core.blastdata import BlastData
 from .core.clustalomegadata import ClustalOmegaData
+from .core.clustalomegaresult import ClustalOmegaResult
 from .core.cluster import Cluster
 from .core.dnarecord import DNARecord
 from .core.ontology import Ontology
 from .core.organism import Organism
 from .core.pairwisealignment import PairwiseAlignment
+from .core.pairwisealignmentresult import PairwiseAlignmentResult
 from .core.proteinrecord import ProteinRecord
 from .core.region import Region
 from .core.sequence import Sequence

diff --git a/pyeed/align/pairwise_aligner.py b/pyeed/align/pairwise_aligner.py
@@ -1,17 +1,23 @@
 from itertools import combinations
-from typing import Dict, List, TYPE_CHECKING
+from typing import Dict, List
 
-from tqdm import tqdm
-from joblib import Parallel, delayed, cpu_count
+from Bio.Align import Alignment as Alignment
 from Bio.Align import PairwiseAligner as BioPairwiseAligner
+from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix
+from joblib import Parallel, cpu_count, delayed
+from rich.progress import (
+    BarColumn,
+    MofNCompleteColumn,
+    Progress,
+    TextColumn,
+    TimeElapsedColumn,
+    TimeRemainingColumn,
+)
 
+from pyeed.core.pairwisealignmentresult import PairwiseAlignmentResult
 
-if TYPE_CHECKING:
-    from Bio.Align import Alignment as Alignment
-    from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix
 
 class PairwiseAligner:
-
     def __init__(self, mode: str) -> None:
         self.mode = mode
         self.match = 1
@@ -20,98 +26,156 @@ def __init__(self, mode: str) -> None:
         self.gap_extend = 0
         self.substitution_matrix = "None"
 
-    def align_pairwise(self, seq1: str, seq2: str, seq1_id: str, seq2_id: str) -> dict:
-        """
-        This function aligns two sequences and returns the alignment results.
+    def _align(
+        self,
+        seq1: Dict[str, str],
+        seq2: Dict[str, str],
+    ) -> Alignment:
+        """Aligns two sequences and returns the alignment results.
 
         Args:
-            seq1 (str): The first sequence to be aligned.
-            seq2 (str): The second sequence to be aligned.
+            seq1 (Dict[str, str]): Sequence 1 to align. Key is the sequence ID.
+            seq2 (Dict[str, str]): Sequence 2 to align. Key is the sequence ID.
+            progress (Progress, optional): `Rich` progress. Defaults to None.
+            task_id (TaskID, optional): `Rich` task_id. Defaults to None.
+
         Returns:
-            dict: The alignment results.
+            Alignment: The alignment results.
         """
-        aligner = BioPairwiseAligner()
-        aligner.mode = self.mode
-        aligner.match_score = self.match
-        aligner.mismatch_score = self.mismatch
-        aligner.open_gap_score = self.gap_open
-        aligner.extend_gap_score = self.gap_extend
 
-        if self.substitution_matrix != "None":
-            aligner.substitution_matrix = self._load_substitution_matrix()
+        aligner = self._get_aligner()
 
-        alignment_result = aligner.align(seq1, seq2)
+        results = aligner.align(list(seq1.values())[0], list(seq2.values())[0])
 
-        alignment_result_dic = self._map_alignment_results(alignment_result[0], seq1_id, seq2_id)
+        return results[0]
 
-        return alignment_result_dic
+    def align_pairwise(
+        self,
+        seq1: Dict[str, str],
+        seq2: Dict[str, str],
+    ) -> PairwiseAlignmentResult:
+        """Aligns two sequences and returns the alignment results.
 
-    def align_multipairwise(self, sequences: Dict[str, str], **kwargs) -> List[dict]:
-        """
-        
+        Args:
+            seq1 (Dict[str, str]): Sequence 1 to align. Key is the sequence ID.
+            seq2 (Dict[str, str]): Sequence 2 to align. Key is the sequence ID.
+
+        Returns:
+            PairwiseAlignmentResult: The alignment results.
         """
+
+        alignment_result = self._align(seq1, seq2)
+
+        return self._map_alignment_results(alignment_result, seq1, seq2)
+
+    def align_multipairwise(self, sequences: Dict[str, str], **kwargs) -> List[dict]:
+        """ """
         # the sequences are stored in a dictionary, key = source.id value = sequence
         # this creates a list of all possible pairs of sequences, on this list the alignment has to happen
         pairs = list(combinations(sequences.keys(), 2))
 
+        progress_bar = Progress(
+            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+            BarColumn(),
+            MofNCompleteColumn(),
+            TextColumn("•"),
+            TimeElapsedColumn(),
+            TextColumn("•"),
+            TimeRemainingColumn(),
+        )
+
+        alignments = []
+        with progress_bar as progress:
+            alignments = Parallel(n_jobs=cpu_count(), prefer="processes")(
+                delayed(self.align_pairwise)(
+                    {pair[0]: sequences[pair[0]]}, {pair[1]: sequences[pair[1]]}
+                )
+                for pair in progress.track(pairs, description="⛓️Aligning sequences...")
+            )
+
+        return alignments
+
+    def _get_aligner(self) -> BioPairwiseAligner:
+        """Creates a BioPython pairwise aligner object with the specified parameters
+        from the class instance."""
         aligner = BioPairwiseAligner()
         aligner.mode = self.mode
+        aligner.match_score = self.match
+        aligner.mismatch_score = self.mismatch
+        aligner.open_gap_score = self.gap_open
+        aligner.extend_gap_score = self.gap_extend
 
+        if self.substitution_matrix != "None":
+            aligner.substitution_matrix = self._load_substitution_matrix()
 
-        alignments = Parallel(n_jobs=cpu_count(), prefer="processes")(
-            delayed(self.align_pairwise)(sequences[a[0]], sequences[a[1]], a[0], a[1])
-            for a in tqdm(pairs, desc="⛓️ Running pairwise alignments")
-        )
-
-        return alignments
+        return aligner
 
-    def _map_alignment_results(self, alignment: "Alignment", seq1_id, seq2_id) -> List[dict]:
+    def _map_alignment_results(
+        self, alignment: Alignment, seq1: Dict[str, str], seq2: Dict[str, str]
+    ) -> PairwiseAlignmentResult:
         # this maps the alignment results to a dictionary
         # this dictionary is used to create the network graph
 
         shorter_seq = min(alignment[0], alignment[1], key=lambda x: len(x))
 
         identities = alignment.counts().identities
         identity = identities / len(shorter_seq)
+        gaps = alignment.counts().gaps
+        mismatches = alignment.counts().mismatches
+
+        sequences = [
+            {"id": list(seq1.keys())[0], "sequence": list(seq1.values())[0]},
+            {"id": list(seq2.keys())[0], "sequence": list(seq2.values())[0]},
+        ]
+
+        aligned_sequences = [
+            {"id": list(seq1.keys())[0], "sequence": alignment[0]},
+            {"id": list(seq2.keys())[0], "sequence": alignment[1]},
+        ]
 
-        alignment_dic = {
-            "seq1": alignment[0],
-            "seq2": alignment[1],
-            "seq1_id": seq1_id,
-            "seq2_id": seq2_id,
+        result_dict = {
             "score": alignment.score,
-            "mismatches": 1 / (alignment.counts().mismatches + 1),
-            "gaps": 1 / (alignment.counts().gaps + 1),
             "identity": identity,
-            "start": alignment.aligned[0],
-            "end": alignment.aligned[1],
+            "gaps": gaps,
+            "mismatches": mismatches,
+            "sequences": sequences,
+            "aligned_sequences": aligned_sequences,
         }
 
-
-        return alignment_dic
+        # alignment_dic = {
+        #     "seq1": alignment[0],
+        #     "seq2": alignment[1],
+        #     "seq1_id": seq1_id,
+        #     "seq2_id": seq2_id,
+        #     "score": alignment.score,
+        #     "mismatches": 1 / (alignment.counts().mismatches + 1),
+        #     "gaps": 1 / (alignment.counts().gaps + 1),
+        #     "identity": identity,
+        #     "start": alignment.aligned[0],
+        #     "end": alignment.aligned[1],
+        # }
+
+        return result_dict
 
     def _load_substitution_matrix(self) -> "BioSubstitutionMatrix":
         from Bio.Align import substitution_matrices
 
         return substitution_matrices.load(self.substitution_matrix)
-
 
 
 if __name__ == "__main__":
     aligner = PairwiseAligner(mode="global")
-    seq1 = "ATCG"
-    seq2 = "AGTC"
-    seq3 = "AAAA"
-    seq4 = "TTTT"
-    sequences = {
-        "seq1": seq1,
-        "seq2": seq2,
-        "seq3": seq3,
-        "seq4": seq4
-    }
-    alignment_result = aligner.align_multipairwise(sequences=sequences)
-
+    seq1 = dict(s1="ATCG")
+    seq2 = dict(s2="ATCC")
+    seq3 = dict(s3="GGCC")
+    seq4 = dict(s4="GGGC")
+
+    all_seq = {**seq1, **seq2, **seq3, **seq4}
+
+    print(aligner.align_pairwise(seq1=seq1, seq2=seq2))
+
+    alignment_result = aligner.align_multipairwise(sequences=all_seq)
 
     print(alignment_result)
 
-    print(alignment_result)
+    print(alignment_result)
diff --git a/pyeed/core/__init__.py b/pyeed/core/__init__.py
@@ -1,13 +1,16 @@
 from .abstractannotation import AbstractAnnotation
 from .alignmentdata import AlignmentData
+from .alignmentresult import AlignmentResult
 from .annotation import Annotation
 from .blastdata import BlastData
 from .clustalomegadata import ClustalOmegaData
+from .clustalomegaresult import ClustalOmegaResult
 from .cluster import Cluster
 from .dnarecord import DNARecord
 from .ontology import Ontology
 from .organism import Organism
 from .pairwisealignment import PairwiseAlignment
+from .pairwisealignmentresult import PairwiseAlignmentResult
 from .proteinrecord import ProteinRecord
 from .region import Region
 from .sequence import Sequence

diff --git a/pyeed/core/abstractannotation.py b/pyeed/core/abstractannotation.py
@@ -49,11 +49,6 @@ class AbstractAnnotation(
         ),
     )
 
-    _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed")
-    _commit: Optional[str] = PrivateAttr(
-        default="a7defc5c87a2296a2e4b522b07236b2aef6413ac"
-    )
-
     _raw_xml_data: Dict = PrivateAttr(default_factory=dict)
 
     @model_validator(mode="after")