Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modified decoy scrambler to no longer use static Random generator #798

Merged
merged 3 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions mzLib/Proteomics/Protein/Protein.cs
Original file line number Diff line number Diff line change
Expand Up @@ -836,21 +836,22 @@ public static Protein ScrambleDecoyProteinSequence(
string scrambledProteinSequence = originalDecoyProtein.BaseSequence;
// Clone the original protein's modifications
var scrambledModificationDictionary = originalDecoyProtein.OriginalNonVariantModifications.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
Random rng = new Random(42);

// Start small and then go big. If we scramble a zero-missed cleavage peptide, but the missed cleavage peptide contains the previously scrambled peptide
// Then we can avoid unnecessary operations as the scrambledProteinSequence will no longer contain the longer sequence of the missed cleavage peptide
foreach(string peptideSequence in sequencesToScramble.OrderBy(seq => seq.Length))
{
if(scrambledProteinSequence.Contains(peptideSequence))
{
string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs,
string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng,
out var swappedArray);
int scrambleAttempts = 1;

// Try five times to scramble the peptide sequence without creating a forbidden sequence
while(forbiddenSequences.Contains(scrambledPeptideSequence) & scrambleAttempts <= 5)
{
scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs,
scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng,
out swappedArray);
scrambleAttempts++;
}
Expand Down Expand Up @@ -896,13 +897,11 @@ public static Protein ScrambleDecoyProteinSequence(
return newProtein;
}

private static Random rng = new Random(42);

/// <summary>
/// Scrambles a peptide sequence, preserving the position of any cleavage sites.
/// </summary>
/// <param name="swappedPositionArray">An array that maps the previous position (index) to the new position (value)</param>
public static string ScrambleSequence(string sequence, List<DigestionMotif> motifs, out int[] swappedPositionArray)
public static string ScrambleSequence(string sequence, List<DigestionMotif> motifs, Random rng, out int[] swappedPositionArray)
{
// First, find the location of every cleavage motif. These sites shouldn't be scrambled.
HashSet<int> zeroBasedCleavageSitesLocations = new();
Expand Down
36 changes: 35 additions & 1 deletion mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// opyright 2016 Stefan Solntsev
// Copyright 2016 Stefan Solntsev
//
// This file (ChemicalFormula.cs) is part of Chemistry Library.
//
Expand Down Expand Up @@ -28,6 +28,7 @@
using Omics.Modifications;
using UsefulProteomicsDatabases;
using Stopwatch = System.Diagnostics.Stopwatch;
using NUnit.Framework.Legacy;

namespace Test.DatabaseTests
{
Expand Down Expand Up @@ -81,6 +82,39 @@ public static void LoadIsoforms()
Assert.AreEqual("Q14103-4", proteinXml[9].Accession);
}

[Test]
[TestCase("cRAP_databaseGPTMD.xml", DecoyType.None)]
[TestCase("uniprot_aifm1.fasta", DecoyType.None)]
[TestCase("cRAP_databaseGPTMD.xml", DecoyType.Reverse)]
[TestCase("uniprot_aifm1.fasta", DecoyType.Reverse)]
public void LoadingIsReproducible(string fileName, DecoyType decoyType)
{
// Load in proteins
var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName);
List<Protein> proteins1 = null;
List<Protein> proteins2 = null;
if(fileName.Contains(".xml"))
{
proteins1 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out var unknownModifications);
proteins2 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out unknownModifications);
}
else if (fileName.Contains(".fasta"))
{
proteins1 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out var unknownModifications);
proteins2 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out unknownModifications);
}
else
{
Assert.Fail("Unknown file type");
}

// check are equivalent lists of proteins
Assert.AreEqual(proteins1.Count, proteins2.Count);
// Because decoys are written in a parallel environment, there is no guarantee that the orders will be the same
CollectionAssert.AreEquivalent(proteins1.Select(p => p.Accession), proteins2.Select(p => p.Accession));
CollectionAssert.AreEquivalent(proteins1.Select(p => p.BaseSequence), proteins2.Select(p => p.BaseSequence));
}

[Test]
public static void LoadModWithNl()
{
Expand Down
67 changes: 67 additions & 0 deletions mzLib/Test/TestProteinDigestion.cs
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,73 @@ public static void Test_ProteinDigest()
Assert.AreEqual("MED[mt:mod1 on D]EEK", pep2.FullSequence);
}

[Test]
[TestCase("cRAP_databaseGPTMD.xml")]
[TestCase("uniprot_aifm1.fasta")]
public static void TestDecoyScramblingIsReproducible(string fileName)
{
// Load in proteins
var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName);
DecoyType decoyType = DecoyType.Reverse;
List<Protein> proteins1 = null;
List<Protein> proteins2 = null;
if (fileName.Contains(".xml"))
{
proteins1 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out var unknownModifications);
proteins2 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out unknownModifications);
}
else if (fileName.Contains(".fasta"))
{
proteins1 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out var unknownModifications);
proteins2 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out unknownModifications);
}
else
{
Assert.Fail("Unknown file type");
}

DigestionParams d = new DigestionParams(
maxMissedCleavages: 1,
minPeptideLength: 5,
initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain);
// Digest target proteins
var pepsToReplace = proteins1.Where(p => !p.IsDecoy)
.SelectMany(p => p.Digest(d, new List<Modification>(), new List<Modification>()).ToList())
.Select(pep => pep.BaseSequence)
.ToHashSet();

// Ensure at least one decoy peptide from each protein is problematic and must be replaced
var singleDecoyPeptides = proteins1
.Where(p => p.IsDecoy)
.Select(p => p.Digest(d, new List<Modification>(), new List<Modification>()).Skip(2).Take(1))
.Select(pwsm => pwsm.First().BaseSequence)
.ToHashSet();

//modify targetpeptides in place
pepsToReplace.UnionWith(singleDecoyPeptides);

// Scramble every decoy from db1
List<Protein> decoys1 = new();
foreach (var protein in proteins1.Where(p => p.IsDecoy))
{
decoys1.Add(Protein.ScrambleDecoyProteinSequence(protein, d, pepsToReplace));
}
// Scramble every decoy from db2
List<Protein> decoys2 = new();
foreach (var protein in proteins2.Where(p => p.IsDecoy))
{
decoys2.Add(Protein.ScrambleDecoyProteinSequence(protein, d, pepsToReplace));
}

// check are equivalent lists of proteins
Assert.AreEqual(decoys1.Count, decoys2.Count);
foreach (var decoyPair in decoys1.Concat(decoys2).GroupBy(p => p.Accession))
{
Assert.AreEqual(2, decoyPair.Count());
Assert.AreEqual(decoyPair.First().BaseSequence, decoyPair.Last().BaseSequence);
}
}

[Test]
public static void TestDecoyScramblerReplacesPeptides()
{
Expand Down
Loading