Skip to content

Commit

Permalink
Merge pull request #17 from kusterlab/16-re-add-uniqueness-sorting-as…
Browse files Browse the repository at this point in the history
…-optional-features

Re-add previously removed operations via optional parameters
  • Loading branch information
MatthewThe authored Nov 19, 2024
2 parents c821c38 + 5552afa commit a2797ec
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 14 deletions.
32 changes: 23 additions & 9 deletions psite_annotation/annotators/peptide_position.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ def __init__(
returnAllPotentialSites: bool = False,
localization_uncertainty: int = 0,
mod_dict: Dict[str, str] = MOD_DICT,
return_unique: bool = False,
return_sorted: bool = False,
) -> None:
"""
Initialize the input files and options for PeptidePositionAnnotator.
Expand All @@ -56,6 +58,8 @@ def __init__(
self.localization_uncertainty = localization_uncertainty
self.protein_sequences = None
self.mod_dict = mod_dict
self.return_unique = return_unique
self.return_sorted = return_sorted

def load_annotations(self) -> None:
"""Reads in protein sequences from fasta file."""
Expand Down Expand Up @@ -106,15 +110,17 @@ def annotate(self, df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame:
]
] = annotated_df[["Proteins", "Modified sequence"]].apply(
lambda x: _get_peptide_positions(
x["Proteins"],
self.protein_sequences,
x["Modified sequence"],
self.returnAllPotentialSites,
self.localization_uncertainty,
self.mod_dict,
mod_regex,
mod_pattern,
potential_mods,
proteinIds=x["Proteins"],
protein_sequences=self.protein_sequences,
mod_peptide_sequence=x["Modified sequence"],
return_unique=self.return_unique,
return_sorted=self.return_sorted,
returnAllPotentialSites=self.returnAllPotentialSites,
localization_uncertainty=self.localization_uncertainty,
mod_dict=self.mod_dict,
mod_regex=mod_regex,
mod_pattern=mod_pattern,
potential_mods=potential_mods,
),
axis=1,
result_type="expand",
Expand Down Expand Up @@ -167,6 +173,8 @@ def _get_peptide_positions(
protein_sequences: Dict[str, str],
mod_peptide_sequence: str,
returnAllPotentialSites: bool = False,
return_unique: bool = False,
return_sorted: bool = False,
localization_uncertainty: int = 0,
mod_dict: Dict[str, str] = MOD_DICT,
mod_regex: Pattern = _get_mod_regex(MOD_DICT),
Expand Down Expand Up @@ -224,6 +232,12 @@ def _get_peptide_positions(

proteinPositions.append(site_position_string)

if return_unique:
proteinPositions = set(proteinPositions)

if return_sorted:
proteinPositions = sorted(proteinPositions)

return (
";".join(map(str, matchedProteins)),
";".join(map(str, startPositions)),
Expand Down
19 changes: 18 additions & 1 deletion psite_annotation/annotators/site_sequence_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ def __init__(
context_left: int = 15,
context_right: int = 15,
retain_other_mods: bool = False,
return_unique: bool=False,
return_sorted: bool=False,
):
"""
Initialize the input files and options for PeptidePositionAnnotator.
Expand All @@ -47,6 +49,8 @@ def __init__(
self.context_left = context_left
self.context_right = context_right
self.retain_other_mods = retain_other_mods
self.return_unique = return_unique
self.return_sorted = return_sorted

def load_annotations(self) -> None:
"""Reads in protein sequences from fasta file."""
Expand Down Expand Up @@ -85,13 +89,19 @@ def annotate(self, df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame:
context_left=self.context_left,
context_right=self.context_right,
retain_other_mods=self.retain_other_mods,
return_unique=self.return_unique,
return_sorted=self.return_sorted,
)
)
return annotated_df


def _get_site_sequence_contexts(
site_position_string: str, protein_sequences: Dict[str, str], **kwargs
site_position_string: str,
protein_sequences: Dict[str, str],
return_unique: bool = False,
return_sorted: bool = False,
**kwargs
) -> str:
if len(site_position_string) == 0:
return ""
Expand All @@ -103,6 +113,13 @@ def _get_site_sequence_contexts(
),
site_position_strings,
)

if return_unique:
contexts = set(contexts)

if return_sorted:
contexts = sorted(contexts)

return ";".join(contexts)


Expand Down
14 changes: 14 additions & 0 deletions psite_annotation/functional_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ def addPeptideAndPsitePositions(
context_right: int = 15,
retain_other_mods: bool = False,
mod_dict: Dict[str, str] = None,
return_unique: bool = False,
return_sorted: bool = False,
) -> pd.DataFrame:
"""Annotate pandas dataframe with positions of the peptide within the protein sequence based on a fasta file.
Expand Down Expand Up @@ -96,6 +98,8 @@ def addPeptideAndPsitePositions(
context_right: number of amino acids to the right of the modification to include
retain_other_mods: retain other modifications from the modified peptide in the sequence context in lower case
mod_dict: dictionary of modifications to single amino acid replacements, e.g. :code:`{"S(ph)": "s", "T(ph)": "t", "Y(ph)": "y"}`. If set to :code:`None`, uses the default annotations for S, T and Y phosphorylation.
return_unique: eliminate duplicates from the 'Site sequence context' and 'Site positions' columns, not preserving the order between them and the rest of the data frame
return_sorted: sort the 'Site sequence context' and 'Site positions' columns alphabetically, not preserving the order between them and the rest of the data frame
Returns:
pd.DataFrame: annotated dataframe
Expand All @@ -110,6 +114,8 @@ def addPeptideAndPsitePositions(
returnAllPotentialSites=returnAllPotentialSites,
localization_uncertainty=localization_uncertainty,
mod_dict=mod_dict,
return_unique=return_unique,
return_sorted=return_sorted,
)
peptide_position_annotator.load_annotations()
df = peptide_position_annotator.annotate(df)
Expand All @@ -120,6 +126,8 @@ def addPeptideAndPsitePositions(
context_left=context_left,
context_right=context_right,
retain_other_mods=retain_other_mods,
return_unique=return_unique,
return_sorted=return_sorted,
)
site_seq_context_annotator.load_annotations()
df = site_seq_context_annotator.annotate(df)
Expand All @@ -134,6 +142,8 @@ def addSiteSequenceContext(
context_left: int = 15,
context_right: int = 15,
retain_other_mods: bool = False,
return_unique: bool = False,
return_sorted: bool = False,
) -> pd.DataFrame:
"""Annotate pandas dataframe with sequence context of a p-site.
Expand All @@ -151,6 +161,8 @@ def addSiteSequenceContext(
context_left: number of amino acids to the left of the modification to include
context_right: number of amino acids to the right of the modification to include
retain_other_mods: retain other modifications from the modified peptide in the sequence context in lower case
return_unique: eliminate duplicate sequences from the 'Site sequence context' column, not preserving the order between this column and the rest of the data frame
return_sorted: sort the sequences from the 'Site sequence context' column alphabetically, not preserving the order between this column and the rest of the data frame
Returns:
pd.DataFrame: annotated dataframe
Expand All @@ -162,6 +174,8 @@ def addSiteSequenceContext(
context_left=context_left,
context_right=context_right,
retain_other_mods=retain_other_mods,
return_unique=return_unique,
return_sorted=return_sorted,
)
annotator.load_annotations()
df = annotator.annotate(df)
Expand Down
38 changes: 35 additions & 3 deletions tests/unit_tests/annotators/test_peptide_position.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,10 +407,41 @@ def test_get_peptide_positions_all_potential_sites(
"Q86U42_S19;Q86U42_T20;Q86U42_Y21;Q86U42-2_S19;Q86U42-2_T20;Q86U42-2_Y21",
)

def test_get_peptide_positions_all_potential_sites(

# Not sure if this is ever desired behaviour, but it's how the old implementation would have handled it:
# Make the Site positions unique, but not the other three columns.
# I would either not add the set() at all, or add it to all four output columns
# The latter might produce undesired results for Start/End positions, if they agree but the Protein IDs are different
def test_get_peptide_positions_all_potential_sites_unique(
self, proteinSequencesExtraPhospho
):
"""Test the _get_peptide_positions function with two isoforms and the returnAllPotentialSites option.
"""Test the _get_peptide_positions function with two isoforms and the returnAllPotentialSites option, eliminating duplicates.
Args:
proteinSequencesExtraPhospho: dictionary of UniProt identifiers to protein sequences
"""
proteinIds = "Q86U42;Q86U42-2;Q86U42"
modPeptideSequence = "(ac)AAAAAAAAAAGAAGGRGS(ph)TYGPGR"

assert pa._get_peptide_positions(
proteinIds,
proteinSequencesExtraPhospho,
modPeptideSequence,
returnAllPotentialSites=True,
return_unique=True,
return_sorted=True
) == (
"Q86U42;Q86U42-2;Q86U42",
"1;1;1",
"25;25;25",
'Q86U42-2_S19;Q86U42-2_T20;Q86U42-2_Y21;Q86U42_S19;Q86U42_T20;Q86U42_Y21',
)

def test_get_peptide_positions_all_potential_sites_sorted(
self, proteinSequencesExtraPhospho
):
"""Test the _get_peptide_positions function with two isoforms and the returnAllPotentialSites option, sorting sites alphabetically.
Args:
proteinSequencesExtraPhospho: dictionary of UniProt identifiers to protein sequences
Expand All @@ -424,11 +455,12 @@ def test_get_peptide_positions_all_potential_sites(
proteinSequencesExtraPhospho,
modPeptideSequence,
returnAllPotentialSites=True,
return_sorted=True
) == (
"Q86U42;Q86U42-2",
"1;1",
"25;25",
"Q86U42_S19;Q86U42_T20;Q86U42_Y21;Q86U42-2_S19;Q86U42-2_T20;Q86U42-2_Y21",
"Q86U42-2_S19;Q86U42-2_T20;Q86U42-2_Y21;Q86U42_S19;Q86U42_T20;Q86U42_Y21",
)


Expand Down
30 changes: 29 additions & 1 deletion tests/unit_tests/annotators/test_site_sequence_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,34 @@ def test_get_site_sequence_contexts(self, proteinSequences):
== "AAAAAAAAGAAGGRGsGPGRRRHLVPGAGGE;AAAAAAAAGAAGGRGsGPGRRRHLVPGAGGE"
)

def test_get_site_sequence_contexts_unique(self, proteinSequences):
"""Test the _get_site_sequence_contexts function with duplicate elimination.
Args:
proteinSequences: dictionary of UniProt identifiers to protein sequences
"""
site_position_string = "Q86U42-2_S19;Q86U42_S19"

assert (
pa._get_site_sequence_contexts(site_position_string, proteinSequences, return_unique=True)
== "AAAAAAAAGAAGGRGsGPGRRRHLVPGAGGE"
)

def test_get_site_sequence_contexts_sorted(self, proteinSequences):
"""Test the _get_site_sequence_contexts function with sorting of the output.
Args:
proteinSequences: dictionary of UniProt identifiers to protein sequences
"""
site_position_string = "Q86U42_Y46;Q86U42_S19"

assert (
pa._get_site_sequence_contexts(site_position_string, proteinSequences, return_sorted=True)
== "AAAAAAAAGAAGGRGsGPGRRRHLVPGAGGE;AGGEAGEGAPGGAGDyGNGLESEELEPEELL"
)

def test_get_site_sequence_contexts_custom_context(self, proteinSequences):
"""Test the _get_site_sequence_contexts function with two isoforms with the identical site sequence context.
Expand Down Expand Up @@ -431,7 +459,7 @@ def test_valid_format(self):
def test_invalid_format(self):
with pytest.raises(ValueError, match="Invalid format for site_position_string"):
pa._unpack_site_position_string("invalid_string")



# You may need to adjust the imports and module names based on your actual module structure.

0 comments on commit a2797ec

Please sign in to comment.