Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
trishorts committed Sep 5, 2024
2 parents bd5aefc + dd5e3b2 commit d60e32e
Show file tree
Hide file tree
Showing 18 changed files with 832 additions and 635 deletions.
2 changes: 1 addition & 1 deletion MetaMorpheus/EngineLayer/CommonParameters.cs
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ public int DeconvolutionMaxAssumedChargeState
/// This parameter determines which PSMs/Peptides will be used as positive training examples
/// when training the GBDT model for PEP.
/// </summary>
public double QValueCutoffForPepCalculation { get; private set; }
public double QValueCutoffForPepCalculation { get; set; }
public DigestionParams DigestionParams { get; private set; }
public bool ReportAllAmbiguity { get; private set; }
public int? NumberOfPeaksToKeepPerWindow { get; private set; }
Expand Down
26 changes: 17 additions & 9 deletions MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using EngineLayer.CrosslinkSearch;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
Expand Down Expand Up @@ -275,18 +276,25 @@ public static void PepQValueInverted(List<SpectralMatch> psms, bool peptideLevel

public void Compute_PEPValue(FdrAnalysisResults myAnalysisResults, List<SpectralMatch> psms)
{
if (psms[0].DigestionParams.Protease.Name == "top-down")
string searchType;
// Currently, searches of mixed data (bottom-up + top-down) are not supported
// PEP will be calculated based on the search type of the first file/PSM in the list, which isn't ideal
// This will be addressed in a future release
switch(psms[0].DigestionParams.Protease.Name)
{
myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psms, "top-down", this.FileSpecificParameters, this.OutputFolder);
case "top-down":
searchType = "top-down";
break;
default:
searchType = "standard";
break;
}
else if (psms[0].DigestionParams.Protease.Name == "crosslink")
if (psms[0] is CrosslinkSpectralMatch)
{
myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psms, "crosslink", this.FileSpecificParameters, this.OutputFolder);
}
else
{
myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psms, "standard", this.FileSpecificParameters, this.OutputFolder);
searchType = "crosslink";
}
myAnalysisResults.BinarySearchTreeMetrics = new PepAnalysisEngine(psms, searchType, FileSpecificParameters, OutputFolder).ComputePEPValuesForAllPSMs();

}

/// <summary>
Expand Down

Large diffs are not rendered by default.

70 changes: 70 additions & 0 deletions MetaMorpheus/EngineLayer/FdrAnalysis/PeptideMatchGroup.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
using Omics;
using Proteomics.ProteolyticDigestion;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace EngineLayer
{
    /// <summary>
    /// This class groups all spectral matches associated with a given peptide together,
    /// to facilitate the calculation of PEP values.
    /// Enumerating a group yields the members of <see cref="SpectralMatches"/>.
    /// </summary>
    public class PeptideMatchGroup : IEnumerable<SpectralMatch>
    {
        /// <summary>
        /// The sequence used as the grouping key. Despite the name, this holds the base sequence
        /// when the group is built by <see cref="GroupByBaseSequence"/>, and the full sequence
        /// when it is built by <see cref="GroupByIndividualPsm"/>.
        /// </summary>
        public string PeptideFullSequence { get; }

        /// <summary>
        /// Every spectral match that belongs to this group.
        /// </summary>
        public List<SpectralMatch> SpectralMatches { get; }

        /// <summary>
        /// The highest-ranked member of <see cref="SpectralMatches"/>, as ordered by the
        /// default comparer for <see cref="SpectralMatch"/>. Re-evaluated on every access.
        /// </summary>
        public SpectralMatch BestMatch => SpectralMatches.MaxBy(match => match);

        /// <summary>
        /// Creates a group from a sequence key and the matches that share it.
        /// </summary>
        /// <param name="fullPeptideSeq"> The full sequence to be used for grouping</param>
        /// <param name="spectralMatches"> Every spectral match that matches the full sequence</param>
        public PeptideMatchGroup(string fullPeptideSeq, List<SpectralMatch> spectralMatches)
        {
            PeptideFullSequence = fullPeptideSeq;
            SpectralMatches = spectralMatches;
        }

        /// <summary>
        /// Groups PSMs by base sequence, ensuring that PSMs with the same base sequence but
        /// different modifications are grouped together when training.
        /// </summary>
        /// <param name="spectralMatches"> The spectral matches to group</param>
        /// <returns>
        /// One group per distinct base sequence, ordered by descending group size and then by
        /// descending score of each group's <see cref="BestMatch"/>.
        /// </returns>
        public static List<PeptideMatchGroup> GroupByBaseSequence(List<SpectralMatch> spectralMatches)
        {
            // TODO: Determine if it's better to group PSMs by base sequence or by full sequence.
            return spectralMatches.GroupBy(p => p.BaseSequence)
                .Select(group => new PeptideMatchGroup(group.Key, group.ToList()))
                // Read the size from the backing list; LINQ Count() on the group itself would
                // walk the whole group through its enumerator just to obtain the same number.
                .OrderByDescending(matchGroup => matchGroup.SpectralMatches.Count)
                .ThenByDescending(matchGroup => matchGroup.BestMatch.Score)
                .ToList();
        }

        /// <summary>
        /// Yields the best match for each distinct full sequence in this group, as ordered by
        /// the default comparer for <see cref="SpectralMatch"/>.
        /// </summary>
        /// <returns> A lazily evaluated sequence with one spectral match per full sequence</returns>
        public IEnumerable<SpectralMatch> GetBestMatchByMod()
        {
            return SpectralMatches.GroupBy(p => p.FullSequence).Select(g => g.MaxBy(p => p));
        }

        /// <summary>
        /// This function is called if there aren't enough peptides to train at the peptide level.
        /// Every spectral match is wrapped in its own single-member group, keyed by its full sequence.
        /// </summary>
        /// <param name="spectralMatches"> The spectral matches to wrap</param>
        /// <returns> One group per input spectral match, in input order</returns>
        public static List<PeptideMatchGroup> GroupByIndividualPsm(List<SpectralMatch> spectralMatches)
        {
            return spectralMatches.Select(psm => new PeptideMatchGroup(psm.FullSequence, new List<SpectralMatch> { psm }))
                .ToList();
        }

        /// <summary>
        /// Enumerates the spectral matches in this group.
        /// </summary>
        public IEnumerator<SpectralMatch> GetEnumerator()
        {
            return SpectralMatches.GetEnumerator();
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
    }
}
19 changes: 19 additions & 0 deletions MetaMorpheus/EngineLayer/ProteinScoringAndFdr/FdrCategory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,25 @@

namespace EngineLayer
{
/// <summary>
/// This enum is used to categorize the FDR of a peptide based on its cleavage specificity.
/// FullySpecific: The peptide is cleaved only at protease-specified cleavage sites.
/// SemiSpecific: The peptide is cleaved on one terminus at protease-specified cleavage sites and at non-specific site on the other terminus.
/// NonSpecific: The peptide is cleaved at non-specific sites on both termini.
///
/// In the Speedy Non-Specific Search use case, all three categories are used with modern search. For each spectrum, the lowest q-value peptide is chosen
/// rather than the highest scoring peptide.
///
/// In a classic NonSpecific search, I believe that only the NonSpecific category is used. Further, I believe that it includes peptides that are cleaved
/// at one or more protease-specified cleavage sites, but also at non-specific sites.
///
/// The Single-N or Single-C protease is a special case. The modern search table is populated only with peptide fragments including the specified terminus.
/// Fragments from the other terminus are not included.
///
/// This is not the same as Semi-Trypsin, which is a classic search where the protein is digested into peptides and then the database is further updated
/// with the full set of peptides that could be generated by terminal degradation.
///
/// </summary>
public enum FdrCategory
{
//Cleavage Specificity
Expand Down
2 changes: 1 addition & 1 deletion MetaMorpheus/EngineLayer/SpectralMatch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ public void ResolveAllAmbiguities()
ModsChemicalFormula = PsmTsvWriter.Resolve(_BestMatchingBioPolymersWithSetMods.Select(b => b.Pwsm.AllModsOneIsNterminus.Select(c => (c.Value)))).ResolvedValue;
Notch = PsmTsvWriter.Resolve(_BestMatchingBioPolymersWithSetMods.Select(b => b.Notch)).ResolvedValue;

// if the PSM matches a target and a decoy and they are the SAME SEQUENCE, remove the decoy
//if the PSM matches a target and a decoy and they are the SAME SEQUENCE, remove the decoy
if (IsDecoy)
{
bool removedPeptides = false;
Expand Down
23 changes: 17 additions & 6 deletions MetaMorpheus/TaskLayer/FilteredPsms.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@

namespace TaskLayer
{
/// <summary>
/// Selects which FDR metric a <see cref="FilteredPsms"/> instance compares against its threshold.
/// </summary>
public enum FilterType
{
/// <summary>
/// Filter on q-value (and notch q-value); reported as "q-value".
/// </summary>
QValue,
/// <summary>
/// Filter on the PEP-derived q-value; reported as "pep q-value".
/// </summary>
PepQValue
}

/// <summary>
/// Contains a filtered list of PSMs.
/// All properties within this class are read-only, and should only be set on object construction
Expand All @@ -18,11 +24,11 @@ public class FilteredPsms : IEnumerable<SpectralMatch>
/// <summary>
/// Filter type can have only two values: FilterType.QValue or FilterType.PepQValue
/// </summary>
public string FilterType { get; init; }
public FilterType FilterType { get; init; }
public double FilterThreshold { get; init; }
public bool FilteringNotPerformed { get; init; }
public bool PeptideLevelFiltering { get; init; }
public FilteredPsms(List<SpectralMatch> filteredPsms, string filterType, double filterThreshold, bool filteringNotPerformed, bool peptideLevelFiltering)
public FilteredPsms(List<SpectralMatch> filteredPsms, FilterType filterType, double filterThreshold, bool filteringNotPerformed, bool peptideLevelFiltering)
{
FilteredPsmsList = filteredPsms;
FilterType = filterType;
Expand All @@ -37,13 +43,18 @@ private bool AboveThreshold(SpectralMatch psm)

switch (FilterType)
{
case "pep q-value":
case FilterType.PepQValue:
return psm.GetFdrInfo(PeptideLevelFiltering).PEP_QValue <= FilterThreshold;
default:
return psm.GetFdrInfo(PeptideLevelFiltering).QValue <= FilterThreshold && psm.GetFdrInfo(PeptideLevelFiltering).QValueNotch <= FilterThreshold;
}
}

/// <summary>
/// Returns the text label for the current filter type:
/// "pep q-value" for <see cref="FilterType.PepQValue"/>, and "q-value" for anything else.
/// </summary>
public string GetFilterTypeString() => FilterType switch
{
    FilterType.PepQValue => "pep q-value",
    _ => "q-value"
};

/// <summary>
/// This method should only be called when filtered PSMs are modified for the purpose of SILAC analysis
/// </summary>
Expand Down Expand Up @@ -87,7 +98,7 @@ public static FilteredPsms Filter(IEnumerable<SpectralMatch> psms,
List<SpectralMatch> filteredPsms = new List<SpectralMatch>();

// set the filter type
string filterType = "q-value";
FilterType filterType = FilterType.QValue;
if (pepQValueThreshold < qValueThreshold)
{
if (psms.Count() < 100)
Expand All @@ -97,13 +108,13 @@ public static FilteredPsms Filter(IEnumerable<SpectralMatch> psms,
}
else
{
filterType = "pep q-value";
filterType = FilterType.PepQValue;
}
}

if (!includeHighQValuePsms)
{
filteredPsms = filterType.Equals("q-value")
filteredPsms = filterType.Equals(FilterType.QValue)
? psms.Where(p => p.GetFdrInfo(filterAtPeptideLevel) != null
&& p.GetFdrInfo(filterAtPeptideLevel).QValue <= filterThreshold
&& p.GetFdrInfo(filterAtPeptideLevel).QValueNotch <= filterThreshold).ToList()
Expand Down
67 changes: 3 additions & 64 deletions MetaMorpheus/TaskLayer/MbrAnalysis/SpectralRecoveryRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,8 @@ public static SpectralRecoveryResults RunSpectralRecoveryAlgorithm(
List<SpectralMatch> allPsms = parameters.AllPsms.
OrderByDescending(p => p).ToList();

AssignEstimatedPsmQvalue(bestMbrMatches, allPsms);
FDRAnalysisOfMbrPsms(bestMbrMatches, allPsms, parameters, fileSpecificParameters);
AssignEstimatedPsmPepQValue(bestMbrMatches, allPsms);

foreach (SpectralRecoveryPSM match in bestMbrMatches.Values) match.FindOriginalPsm(allPsms);
}

Expand Down Expand Up @@ -208,70 +207,10 @@ private static void FDRAnalysisOfMbrPsms(ConcurrentDictionary<ChromatographicPea
Select(p => p.Value.spectralLibraryMatch).
Where(v => v != null).
ToList();
List<int>[] psmGroupIndices = PEP_Analysis_Cross_Validation.Get_PSM_Group_Indices(psms, 1);
MLContext mlContext = new MLContext();
IEnumerable<PsmData>[] PSMDataGroups = new IEnumerable<PsmData>[1];

string searchType = "standard";
if (psms[0].DigestionParams.Protease.Name == "top-down")
{
searchType = "top-down";
}

int chargeStateMode = PEP_Analysis_Cross_Validation.GetChargeStateMode(allPsms);

Dictionary<string, Dictionary<int, Tuple<double, double>>> fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified = PEP_Analysis_Cross_Validation.ComputeHydrophobicityValues(allPsms, fileSpecificParameters, false);
Dictionary<string, Dictionary<int, Tuple<double, double>>> fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = PEP_Analysis_Cross_Validation.ComputeHydrophobicityValues(allPsms, fileSpecificParameters, true);
PEP_Analysis_Cross_Validation.ComputeMobilityValues(allPsms, fileSpecificParameters);

Dictionary<string, float> fileSpecificMedianFragmentMassErrors = PEP_Analysis_Cross_Validation.GetFileSpecificMedianFragmentMassError(allPsms);

PSMDataGroups[0] = PEP_Analysis_Cross_Validation.CreatePsmData(searchType, fileSpecificParameters, psms, psmGroupIndices[0], fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode);

string[] trainingVariables = PsmData.trainingInfos[searchType];

TransformerChain<BinaryPredictionTransformer<Microsoft.ML.Calibrators.CalibratedModelParametersBase<Microsoft.ML.Trainers.FastTree.FastTreeBinaryModelParameters, Microsoft.ML.Calibrators.PlattCalibrator>>>[] trainedModels = new TransformerChain<BinaryPredictionTransformer<Microsoft.ML.Calibrators.CalibratedModelParametersBase<Microsoft.ML.Trainers.FastTree.FastTreeBinaryModelParameters, Microsoft.ML.Calibrators.PlattCalibrator>>>[1];

var trainer = mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfTrees: 400);
var pipeline = mlContext.Transforms.Concatenate("Features", trainingVariables).Append(trainer);

IDataView dataView = mlContext.Data.LoadFromEnumerable(PSMDataGroups[0]);

string outputFolder = parameters.OutputFolder;

trainedModels[0] = pipeline.Fit(dataView);

PEP_Analysis_Cross_Validation.Compute_PSM_PEP(psms, psmGroupIndices[0], mlContext, trainedModels[0], searchType, fileSpecificParameters, fileSpecificMedianFragmentMassErrors, chargeStateMode, outputFolder);
}
new FdrAnalysisEngine(psms, parameters.NumNotches, fileSpecificParameters.First().Item2, fileSpecificParameters,
new List<string> { parameters.SearchTaskId }, analysisType: "PSM", doPEP: true, outputFolder: parameters.OutputFolder).Run();

/// <summary>
/// Assigns an estimated PEP q-value to every non-null spectral-library match in
/// <paramref name="bestMbrMatches"/>. The estimate is read off the ascending list of PEP values
/// collected from those same matches: the midpoint of the two neighbouring PEP values around
/// the first entry that exceeds the match's own PEP, or the largest PEP when no earlier entry
/// exceeds it.
/// </summary>
/// <param name="bestMbrMatches"> The MBR matches whose spectral-library matches are updated in place</param>
/// <param name="allPsms"> Not read by this method</param>
private static void AssignEstimatedPsmPepQValue(ConcurrentDictionary<ChromatographicPeak, SpectralRecoveryPSM> bestMbrMatches, List<SpectralMatch> allPsms)
{
    // Ascending PEP values of every spectral-library match that exists.
    List<double> sortedPeps = bestMbrMatches
        .Select(entry => entry.Value.spectralLibraryMatch)
        .Where(candidate => candidate != null)
        .Select(candidate => candidate.FdrInfo.PEP)
        .OrderBy(pep => pep)
        .ToList();
    int lastIndex = sortedPeps.Count - 1;

    foreach (SpectralRecoveryPSM recoveredPsm in bestMbrMatches.Values)
    {
        var libraryMatch = recoveredPsm.spectralLibraryMatch;
        if (libraryMatch != null)
        {
            // Walk forward to the first PEP that is strictly greater than this match's PEP,
            // stopping at the final element if none is.
            int position = 0;
            while (position < lastIndex && sortedPeps[position] <= libraryMatch.FdrInfo.PEP)
            {
                position++;
            }

            libraryMatch.FdrInfo.PEP_QValue = position == lastIndex
                ? sortedPeps.Last()
                : (sortedPeps[position - 1] + sortedPeps[position]) / 2;
        }
    }
}

private static void WriteSpectralRecoveryPsmResults(ConcurrentDictionary<ChromatographicPeak, SpectralRecoveryPSM> bestMbrMatches, PostSearchAnalysisParameters parameters)
Expand Down
40 changes: 40 additions & 0 deletions MetaMorpheus/TaskLayer/MetaMorpheusTask.cs
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,46 @@ protected List<Protein> LoadProteins(string taskId, List<DbForTask> dbFilenameLi
{
Warn("Warning: " + emptyProteinEntries + " empty protein entries ignored");
}



if (!proteinList.Any(p => p.IsDecoy))
{
Status("Done loading proteins", new List<string> { taskId });
return proteinList;
}

// Sanitize the decoys
// TODO: Fix this so that it accounts for multi-protease searches. Currently, we only consider the first protease
// when looking for target/decoy collisions

HashSet<string> targetPeptideSequences = new();
foreach(var protein in proteinList.Where(p => !p.IsDecoy))
{
// When thinking about decoy collisions, we can ignore modifications
foreach(var peptide in protein.Digest(commonParameters.DigestionParams, new List<Modification>(), new List<Modification>()))
{
targetPeptideSequences.Add(peptide.BaseSequence);
}
}
// Now, we iterate through the decoys and scramble the sequences that correspond to target peptides
for(int i = 0; i < proteinList.Count; i++)
{
if(proteinList[i].IsDecoy)
{
var peptidesToReplace = proteinList[i]
.Digest(commonParameters.DigestionParams, new List<Modification>(), new List<Modification>())
.Select(p => p.BaseSequence)
.Where(targetPeptideSequences.Contains)
.ToList();
if(peptidesToReplace.Any())
{
proteinList[i] = Protein.ScrambleDecoyProteinSequence(proteinList[i], commonParameters.DigestionParams, forbiddenSequences: targetPeptideSequences, peptidesToReplace);
}
}
}

Status("Done loading proteins", new List<string> { taskId });
return proteinList;
}

Expand Down
Loading

0 comments on commit d60e32e

Please sign in to comment.