Merge pull request #17 from kusterlab/16-re-add-uniqueness-sorting-as…

…-optional-features Re-add previously removed operations via optional parameters
kusterlab · Nov 19, 2024 · a2797ec · a2797ec
2 parents c821c38 + 5552afa
commit a2797ec
Show file tree

Hide file tree

Showing 5 changed files with 119 additions and 14 deletions.
diff --git a/psite_annotation/annotators/peptide_position.py b/psite_annotation/annotators/peptide_position.py
@@ -38,6 +38,8 @@ def __init__(
         returnAllPotentialSites: bool = False,
         localization_uncertainty: int = 0,
         mod_dict: Dict[str, str] = MOD_DICT,
+            return_unique: bool = False,
+            return_sorted: bool = False,
     ) -> None:
         """
         Initialize the input files and options for PeptidePositionAnnotator.
@@ -56,6 +58,8 @@ def __init__(
         self.localization_uncertainty = localization_uncertainty
         self.protein_sequences = None
         self.mod_dict = mod_dict
+        self.return_unique = return_unique
+        self.return_sorted = return_sorted
 
     def load_annotations(self) -> None:
         """Reads in protein sequences from fasta file."""
@@ -106,15 +110,17 @@ def annotate(self, df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame:
             ]
         ] = annotated_df[["Proteins", "Modified sequence"]].apply(
             lambda x: _get_peptide_positions(
-                x["Proteins"],
-                self.protein_sequences,
-                x["Modified sequence"],
-                self.returnAllPotentialSites,
-                self.localization_uncertainty,
-                self.mod_dict,
-                mod_regex,
-                mod_pattern,
-                potential_mods,
+                proteinIds=x["Proteins"],
+                protein_sequences=self.protein_sequences,
+                mod_peptide_sequence=x["Modified sequence"],
+                return_unique=self.return_unique,
+                return_sorted=self.return_sorted,
+                returnAllPotentialSites=self.returnAllPotentialSites,
+                localization_uncertainty=self.localization_uncertainty,
+                mod_dict=self.mod_dict,
+                mod_regex=mod_regex,
+                mod_pattern=mod_pattern,
+                potential_mods=potential_mods,
             ),
             axis=1,
             result_type="expand",
@@ -167,6 +173,8 @@ def _get_peptide_positions(
     protein_sequences: Dict[str, str],
     mod_peptide_sequence: str,
     returnAllPotentialSites: bool = False,
+        return_unique: bool = False,
+        return_sorted: bool = False,
     localization_uncertainty: int = 0,
     mod_dict: Dict[str, str] = MOD_DICT,
     mod_regex: Pattern = _get_mod_regex(MOD_DICT),
@@ -224,6 +232,12 @@ def _get_peptide_positions(
 
                 proteinPositions.append(site_position_string)
 
+    if return_unique:
+        proteinPositions = set(proteinPositions)
+
+    if return_sorted:
+        proteinPositions = sorted(proteinPositions)
+
     return (
         ";".join(map(str, matchedProteins)),
         ";".join(map(str, startPositions)),

diff --git a/psite_annotation/annotators/site_sequence_context.py b/psite_annotation/annotators/site_sequence_context.py
@@ -29,6 +29,8 @@ def __init__(
         context_left: int = 15,
         context_right: int = 15,
         retain_other_mods: bool = False,
+        return_unique: bool=False,
+        return_sorted: bool=False,
     ):
         """
         Initialize the input files and options for PeptidePositionAnnotator.
@@ -47,6 +49,8 @@ def __init__(
         self.context_left = context_left
         self.context_right = context_right
         self.retain_other_mods = retain_other_mods
+        self.return_unique = return_unique
+        self.return_sorted = return_sorted
 
     def load_annotations(self) -> None:
         """Reads in protein sequences from fasta file."""
@@ -85,13 +89,19 @@ def annotate(self, df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame:
                 context_left=self.context_left,
                 context_right=self.context_right,
                 retain_other_mods=self.retain_other_mods,
+                return_unique=self.return_unique,
+                return_sorted=self.return_sorted,
             )
         )
         return annotated_df
 
 
 def _get_site_sequence_contexts(
-    site_position_string: str, protein_sequences: Dict[str, str], **kwargs
+        site_position_string: str,
+        protein_sequences: Dict[str, str],
+        return_unique: bool = False,
+        return_sorted: bool = False,
+        **kwargs
 ) -> str:
     if len(site_position_string) == 0:
         return ""
@@ -103,6 +113,13 @@ def _get_site_sequence_contexts(
         ),
         site_position_strings,
     )
+
+    if return_unique:
+        contexts = set(contexts)
+
+    if return_sorted:
+        contexts = sorted(contexts)
+
     return ";".join(contexts)
 
 

diff --git a/psite_annotation/functional_annotation.py b/psite_annotation/functional_annotation.py
@@ -61,6 +61,8 @@ def addPeptideAndPsitePositions(
     context_right: int = 15,
     retain_other_mods: bool = False,
     mod_dict: Dict[str, str] = None,
+        return_unique: bool = False,
+        return_sorted: bool = False,
 ) -> pd.DataFrame:
     """Annotate pandas dataframe with positions of the peptide within the protein sequence based on a fasta file.
 
@@ -96,6 +98,8 @@ def addPeptideAndPsitePositions(
         context_right: number of amino acids to the right of the modification to include
         retain_other_mods: retain other modifications from the modified peptide in the sequence context in lower case
         mod_dict: dictionary of modifications to single amino acid replacements, e.g. :code:`{"S(ph)": "s", "T(ph)": "t", "Y(ph)": "y"}`. If set to :code:`None`, uses the default annotations for S, T and Y phosphorylation.
+        return_unique: eliminate duplicates from the 'Site sequence context' and 'Site positions' columns, not preserving the order between them and the rest of the data frame
+        return_sorted: sort the 'Site sequence context' and 'Site positions' columns alphabetically, not preserving the order between them and the rest of the data frame
 
     Returns:
         pd.DataFrame: annotated dataframe
@@ -110,6 +114,8 @@ def addPeptideAndPsitePositions(
         returnAllPotentialSites=returnAllPotentialSites,
         localization_uncertainty=localization_uncertainty,
         mod_dict=mod_dict,
+        return_unique=return_unique,
+        return_sorted=return_sorted,
     )
     peptide_position_annotator.load_annotations()
     df = peptide_position_annotator.annotate(df)
@@ -120,6 +126,8 @@ def addPeptideAndPsitePositions(
         context_left=context_left,
         context_right=context_right,
         retain_other_mods=retain_other_mods,
+        return_unique=return_unique,
+        return_sorted=return_sorted,
     )
     site_seq_context_annotator.load_annotations()
     df = site_seq_context_annotator.annotate(df)
@@ -134,6 +142,8 @@ def addSiteSequenceContext(
     context_left: int = 15,
     context_right: int = 15,
     retain_other_mods: bool = False,
+    return_unique: bool = False,
+    return_sorted: bool = False,
 ) -> pd.DataFrame:
     """Annotate pandas dataframe with sequence context of a p-site.
 
@@ -151,6 +161,8 @@ def addSiteSequenceContext(
         context_left: number of amino acids to the left of the modification to include
         context_right: number of amino acids to the right of the modification to include
         retain_other_mods: retain other modifications from the modified peptide in the sequence context in lower case
+        return_unique: eliminate duplicated sequences from the 'Site sequence context' column, not preserving the order between this column and the rest of the data frame
+        return_sorted: sort the sequences from the 'Site sequence context' column alphabetically, not preserving the order between this column and the rest of the data frame
 
     Returns:
         pd.DataFrame: annotated dataframe
@@ -162,6 +174,8 @@ def addSiteSequenceContext(
         context_left=context_left,
         context_right=context_right,
         retain_other_mods=retain_other_mods,
+        return_unique=return_unique,
+        return_sorted=return_sorted,
     )
     annotator.load_annotations()
     df = annotator.annotate(df)

diff --git a/tests/unit_tests/annotators/test_peptide_position.py b/tests/unit_tests/annotators/test_peptide_position.py
@@ -407,10 +407,41 @@ def test_get_peptide_positions_all_potential_sites(
             "Q86U42_S19;Q86U42_T20;Q86U42_Y21;Q86U42-2_S19;Q86U42-2_T20;Q86U42-2_Y21",
         )
 
-    def test_get_peptide_positions_all_potential_sites(
+
+    # Not sure if this is ever desired behaviour, but it's how the old implementation would have handled it:
+    # Make the Site positions unique, but not the other three columns.
+    # I would either not add the set() at all, or add it to all four output columns
+    # The latter might produce undesired results for Start/End positions, if they agree but the Protein IDs are different
+    def test_get_peptide_positions_all_potential_sites_unique(
         self, proteinSequencesExtraPhospho
     ):
-        """Test the _get_peptide_positions function with two isoforms and the returnAllPotentialSites option.
+        """Test the _get_peptide_positions function with two isoforms and the returnAllPotentialSites option, eliminating duplicates.
+
+        Args:
+            proteinSequencesExtraPhospho: dictionary of UniProt identifiers to protein sequences
+
+        """
+        proteinIds = "Q86U42;Q86U42-2;Q86U42"
+        modPeptideSequence = "(ac)AAAAAAAAAAGAAGGRGS(ph)TYGPGR"
+
+        assert pa._get_peptide_positions(
+            proteinIds,
+            proteinSequencesExtraPhospho,
+            modPeptideSequence,
+            returnAllPotentialSites=True,
+            return_unique=True,
+            return_sorted=True
+        ) == (
+            "Q86U42;Q86U42-2;Q86U42",
+            "1;1;1",
+            "25;25;25",
+            'Q86U42-2_S19;Q86U42-2_T20;Q86U42-2_Y21;Q86U42_S19;Q86U42_T20;Q86U42_Y21',
+        )
+
+    def test_get_peptide_positions_all_potential_sites_sorted(
+        self, proteinSequencesExtraPhospho
+    ):
+        """Test the _get_peptide_positions function with two isoforms and the returnAllPotentialSites option, sorting sites alphabetically.
 
         Args:
             proteinSequencesExtraPhospho: dictionary of UniProt identifiers to protein sequences
@@ -424,11 +455,12 @@ def test_get_peptide_positions_all_potential_sites(
             proteinSequencesExtraPhospho,
             modPeptideSequence,
             returnAllPotentialSites=True,
+            return_sorted=True
         ) == (
             "Q86U42;Q86U42-2",
             "1;1",
             "25;25",
-            "Q86U42_S19;Q86U42_T20;Q86U42_Y21;Q86U42-2_S19;Q86U42-2_T20;Q86U42-2_Y21",
+            "Q86U42-2_S19;Q86U42-2_T20;Q86U42-2_Y21;Q86U42_S19;Q86U42_T20;Q86U42_Y21",
         )
 
 

diff --git a/tests/unit_tests/annotators/test_site_sequence_context.py b/tests/unit_tests/annotators/test_site_sequence_context.py
@@ -331,6 +331,34 @@ def test_get_site_sequence_contexts(self, proteinSequences):
             == "AAAAAAAAGAAGGRGsGPGRRRHLVPGAGGE;AAAAAAAAGAAGGRGsGPGRRRHLVPGAGGE"
         )
 
+    def test_get_site_sequence_contexts_unique(self, proteinSequences):
+        """Test the _get_site_sequence_contexts function with duplicate elimination.
+
+        Args:
+            proteinSequences: dictionary of UniProt identifiers to protein sequences
+
+        """
+        site_position_string = "Q86U42-2_S19;Q86U42_S19"
+
+        assert (
+            pa._get_site_sequence_contexts(site_position_string, proteinSequences, return_unique=True)
+            == "AAAAAAAAGAAGGRGsGPGRRRHLVPGAGGE"
+        )
+
+    def test_get_site_sequence_contexts_sorted(self, proteinSequences):
+        """Test the _get_site_sequence_contexts function with sorting of output.
+
+        Args:
+            proteinSequences: dictionary of UniProt identifiers to protein sequences
+
+        """
+        site_position_string = "Q86U42_Y46;Q86U42_S19"
+
+        assert (
+            pa._get_site_sequence_contexts(site_position_string, proteinSequences, return_sorted=True)
+            == "AAAAAAAAGAAGGRGsGPGRRRHLVPGAGGE;AGGEAGEGAPGGAGDyGNGLESEELEPEELL"
+        )
+
     def test_get_site_sequence_contexts_custom_context(self, proteinSequences):
         """Test the _get_site_sequence_contexts function with two isoforms with the identical site sequence context.
 
@@ -431,7 +459,7 @@ def test_valid_format(self):
     def test_invalid_format(self):
         with pytest.raises(ValueError, match="Invalid format for site_position_string"):
             pa._unpack_site_position_string("invalid_string")
-            
+
 
 
 # You may need to adjust the imports and module names based on your actual module structure.