Skip to content

Commit

Permalink
added fields
Browse files Browse the repository at this point in the history
  • Loading branch information
NiklasAbraham committed Apr 9, 2024
1 parent 67613db commit d2837a5
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 36 deletions.
118 changes: 83 additions & 35 deletions pyeed/network/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,46 +71,98 @@ class SequenceNetwork(BaseModel):
default=[],
description="List of selected sequences",
)

### Alignment results need to be stored, but they can't be kept in a pd.DataFrame, so they are stored in the fields below
"""
"source_id": pair[0].source_id,
"target_id": pair[1].source_id,
"score": alignment_result.score,
"identity": identity,
"similarity": alignment_result.similarity,
"gaps": alignment_result.gaps,
"mismatches": alignment_result.mismatches,
"mode": mode,
"""
source_ids: Optional[List[str]] = Field(
default=[],
description="Source sequence IDs",
)

target_ids: Optional[List[str]] = Field(
default=[],
description="Target sequence IDs",
)

scores: Optional[List[float]] = Field(
default=[],
description="Alignment scores",
)

identities: Optional[List[float]] = Field(
default=[],
description="Alignment identities",
)

gaps: Optional[List[int]] = Field(
default=[],
description="Alignment gaps",
)

mismatches: Optional[List[int]] = Field(
default=[],
description="Alignment mismatches",
)

mode: Optional[str] = Field(
default=None,
description="Mode of the pairwise alignment",
)

###### End of alignment results definition

shorter_seq: Optional[int] = Field(
default=None,
description="Shorter sequence in the network",
)


def __init__(self, sequences: List[AbstractSequence], weight: str = "identity", color: str = "name", threshold: float = None, label: str = "name", dimensions: int = 3):
super().__init__()
self._alignments = self._create_pairwise_alignments()
self.shorter_seq = None
self._create_pairwise_alignments(sequences, aligner=PairwiseAligner, mode="global")


def add_target(self, target: AbstractSequence):
    """Record a target sequence's ``source_id`` in ``self.targets``.

    The ID is appended only when it is not already contained in
    ``self.targets``, so calling this again with the same target adds
    nothing.

    Args:
        target (AbstractSequence): Sequence whose ``source_id`` is recorded.
    """
    target_id = target.source_id
    if target_id in self.targets:
        # Already registered; keep the list free of duplicates.
        return
    self.targets.append(target_id)


def _map_pairwise_alignment_results(self, alignment_result: BioAlignment, pair: Tuple[Sequence, Sequence], mode: str) -> "pd.DataFrame":
def _map_pairwise_alignment_results(self, alignment_result: BioAlignment, pair: Tuple[Sequence, Sequence], mode: str):
"""
Maps the results of pairwise alignments to a dataframe.
Maps the results of pairwise alignments to internal fields.
Args:
alignment_result (BioAlignment): The result of the pairwise alignment.
pair (Tuple[Sequence, Sequence]): The pair of sequences being aligned.
mode (str): The mode of the pairwise alignment.
Returns:
PairwiseAlignmentDataFrame: A dataframe with the alignment results.
None
"""
df = pd.DataFrame(
{
"source_id": pair[0].source_id,
"target_id": pair[1].source_id,
"score": alignment_result.score,
"identity": alignment_result.identity,
"similarity": alignment_result.similarity,
"gaps": alignment_result.gaps,
"mismatches": alignment_result.mismatches,
"mode": mode,
},
index=[0],
)
return df

def _create_pairwise_alignments(self, aligner: "PairwiseAligner", **kwargs):
identities = alignment_result.counts().identities
identity = identities / len(self.shorter_seq.sequence)

self.source_ids.append(pair[0].source_id)
self.target_ids.append(pair[1].source_id)
self.scores.append(alignment_result.score)
self.identities.append(identity)
# self.similarities.append(alignment_result.similarity)
self.gaps.append(alignment_result.counts().gaps)
self.mismatches.append(alignment_result.counts().mismatches)


def _create_pairwise_alignments(self, input_sequences, aligner: "PairwiseAligner", **kwargs):
"""
Creates pairwise alignments between sequences and writes the alignments in the right dataframe structure. This structure is later used in Cytoscape to build the graph.
Expand All @@ -125,34 +177,36 @@ def _create_pairwise_alignments(self, aligner: "PairwiseAligner", **kwargs):
ValueError: If the number of sequences is less than 2.
Returns:
Dataframe with the aligments
Nothing; the data is stored internally in fields of the class.
"""
# this is later used as a normalization factor for the identity entry
self.shorter_seq = min(input_sequences, key=lambda x: len(x.sequence))

# Pairwise alignment
if len(self.input_sequences) == 2:
if len(input_sequences) == 2:

pairwise_aligner = aligner(
sequences=[
self.input_sequences[0].sequence,
self.input_sequences[1].sequence,
input_sequences[0].sequence,
input_sequences[1].sequence,
],
**kwargs,
)
alignment_result = pairwise_aligner.align()
self.method = pairwise_aligner.mode
self.mode = pairwise_aligner.mode

return self._map_pairwise_alignment_results(
alignment_result,
pair=(
self.input_sequences[0],
self.input_sequences[1],
input_sequences[0],
input_sequences[1],
),
mode=pairwise_aligner.mode,
)

# Multi pairwise alignment
elif len(self.input_sequences) > 2:
pairs = list(combinations(self.input_sequences, 2))
elif len(input_sequences) > 2:
pairs = list(combinations(input_sequences, 2))

aligners = [
aligner(sequences=[s.sequence for s in pair], **kwargs)
Expand All @@ -164,14 +218,8 @@ def _create_pairwise_alignments(self, aligner: "PairwiseAligner", **kwargs):
for a in tqdm(aligners, desc="⛓️ Running pairwise alignments")
)

df_alignment = None
for alignment, pair in zip(alignments, pairs):
if df_alignment is None:
df_alignment = self._map_pairwise_alignment_results(alignment, pair, mode=aligners[0].mode)
else:
df_alignment = pd.concat(self._map_pairwise_alignment_results(alignment, pair, mode=aligners[0].mode), df_alignment)

return df_alignment
self._map_pairwise_alignment_results(alignment, pair, mode=aligners[0].mode)

else:
raise ValueError(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pyeed.network import SequenceNetwork
from pyeed.core import ProteinInfo

class TestNetworkDataframe:
class TestNetworkGraphBuild:

def test_general(self):
# check if it can be created and does the basic job
Expand All @@ -29,3 +29,27 @@ def test_general(self):
color="taxonomy_id",
)

def test_graph_build(self):
    """Build a SequenceNetwork from a fixed accession list and request its graph.

    NOTE(review): nothing is asserted on the value returned by
    ``network.graph()``; the test only fails if one of the calls raises.
    """
    accession_ids = [
        "MBP1912539.1",
        "SEV92896.1",
        "MBO8174569.1",
        "WP_042680787.1",
        "NPA47376.1",
        "WP_167889085.1",
        "WP_048165429.1",
        "ACS90033.1",
    ]
    proteins = ProteinInfo.get_ids(accession_ids)

    # Create a network
    network_options = {
        "weight": "identity",
        "threshold": 0.9,
        "dimensions": 2,
        "color": "taxonomy_id",
    }
    network = SequenceNetwork(sequences=proteins, **network_options)

    # Check if the graph is created
    graph = network.graph()

0 comments on commit d2837a5

Please sign in to comment.