diff --git a/MetaMorpheus/EngineLayer/CommonParameters.cs b/MetaMorpheus/EngineLayer/CommonParameters.cs index 335749707..f95d6cb12 100644 --- a/MetaMorpheus/EngineLayer/CommonParameters.cs +++ b/MetaMorpheus/EngineLayer/CommonParameters.cs @@ -163,7 +163,7 @@ public int DeconvolutionMaxAssumedChargeState /// This parameter determines which PSMs/Peptides will be used as postive training examples /// when training the GBDT model for PEP. /// - public double QValueCutoffForPepCalculation { get; private set; } + public double QValueCutoffForPepCalculation { get; set; } public DigestionParams DigestionParams { get; private set; } public bool ReportAllAmbiguity { get; private set; } public int? NumberOfPeaksToKeepPerWindow { get; private set; } diff --git a/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs b/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs index 454bcebbc..e75e91b19 100644 --- a/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs +++ b/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs @@ -1,4 +1,5 @@ -using System; +using EngineLayer.CrosslinkSearch; +using System; using System.Collections.Generic; using System.IO; using System.Linq; @@ -275,18 +276,25 @@ public static void PepQValueInverted(List psms, bool peptideLevel public void Compute_PEPValue(FdrAnalysisResults myAnalysisResults, List psms) { - if (psms[0].DigestionParams.Protease.Name == "top-down") + string searchType; + // Currently, searches of mixed data (bottom-up + top-down) are not supported + // PEP will be calculated based on the search type of the first file/PSM in the list, which isn't ideal + // This will be addressed in a future release + switch(psms[0].DigestionParams.Protease.Name) { - myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psms, "top-down", this.FileSpecificParameters, this.OutputFolder); + case "top-down": + searchType = "top-down"; + break; + default: + searchType = "standard"; + break; } - else if (psms[0].DigestionParams.Protease.Name == "crosslink") + if (psms[0] is CrosslinkSpectralMatch) { - myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psms, "crosslink", this.FileSpecificParameters, this.OutputFolder); - } - else - { - myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psms, "standard", this.FileSpecificParameters, this.OutputFolder); + searchType = "crosslink"; } + myAnalysisResults.BinarySearchTreeMetrics = new PepAnalysisEngine(psms, searchType, FileSpecificParameters, OutputFolder).ComputePEPValuesForAllPSMs(); + } /// diff --git a/MetaMorpheus/EngineLayer/FdrAnalysis/PEPValueAnalysisGeneric.cs b/MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs similarity index 66% rename from MetaMorpheus/EngineLayer/FdrAnalysis/PEPValueAnalysisGeneric.cs rename to MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs index 2fa22248f..f68cd3c0a 100644 --- a/MetaMorpheus/EngineLayer/FdrAnalysis/PEPValueAnalysisGeneric.cs +++ b/MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs @@ -16,23 +16,41 @@ using System.Threading.Tasks; using Omics.Modifications; using Omics; +using Easy.Common.Extensions; namespace EngineLayer { - public static class PEP_Analysis_Cross_Validation + public class PepAnalysisEngine { private static readonly double AbsoluteProbabilityThatDistinguishesPeptides = 0.05; - private static Dictionary>> fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified = new Dictionary>>(); - private static Dictionary>> fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = new Dictionary>>(); - private static Dictionary>> fileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE = new Dictionary>>(); - + + //These two dictionaries contain the average and standard deviations of hydrophobicitys measured in 1 minute increments accross each raw + //file separately. An individully measured hydrobophicty calculated for a specific PSM sequence is compared to these values by computing + //the z-score. That z-score is used as a feature for machine learning. + //Separate dictionaries are created for peptides with modifications because SSRcalc doesn't really do a good job predicting hyrophobicity + + //The first string in the dictionary is the filename + //The value of the dictionary is another dictionary that profiles the hydrophobicity behavior. + //Each key is a retention time rounded to the nearest minute. + //The value Tuple is the average and standard deviation, respectively, of the predicted hydrophobicities of the observed peptides eluting at that rounded retention time. + public Dictionary>> FileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified { get; private set; } + public Dictionary>> FileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified { get; private set; } + public Dictionary>> FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE { get; private set; } + /// /// A dictionary which stores the chimeric ID string in the key and the number of chimeric identifications as the vale /// - private static Dictionary chimeraCountDictionary = new Dictionary(); - public static bool UsePeptideLevelQValueForTraining = true; - public static double QValueCutoff = 0.005; - + private Dictionary chimeraCountDictionary = new Dictionary(); + public Dictionary FileSpecificMedianFragmentMassErrors { get; private set; } + public Dictionary FileSpecificParametersDictionary { get; private set; } + public int ChargeStateMode { get; private set; } + + public double QValueCutoff { get; } + public bool UsePeptideLevelQValueForTraining = true; + public string[] TrainingVariables { get; } + public string OutputFolder { get; } + public List AllPsms { get; } + public string SearchType { get; } /// /// This method is used to compute the PEP values for all PSMs in a dataset. @@ -42,158 +60,254 @@ public static class PEP_Analysis_Cross_Validation /// /// /// - public static string ComputePEPValuesForAllPSMsGeneric(List psms, string searchType, List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters, string outputFolder) + public void SetFileSpecificParameters(List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters) + { + FileSpecificParametersDictionary = fileSpecificParameters.ToDictionary(p => Path.GetFileName(p.fileName), p => p.fileSpecificParameters); + } + + public PepAnalysisEngine(List psms, string searchType, List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters, string outputFolder) + { + // This creates a new list of PSMs, but does not clone the Psms themselves. + // This allows the PSMs to be modified and the order to be preserved + AllPsms = psms.OrderByDescending(p => p).ToList(); + TrainingVariables = PsmData.trainingInfos[searchType]; + OutputFolder = outputFolder; + SearchType = searchType; + SetFileSpecificParameters(fileSpecificParameters); + BuildFileSpecificDictionaries(psms, TrainingVariables); + QValueCutoff = Math.Max(fileSpecificParameters.Select(t => t.fileSpecificParameters.QValueCutoffForPepCalculation).Min(), 0.005); + + // If we have more than 100 peptides, we will train on the peptide level. Otherwise, we will train on the PSM level + UsePeptideLevelQValueForTraining = psms.Select(psm => psm.FullSequence).Distinct().Count(seq => seq.IsNotNullOrEmpty()) >= 100; + } + + public string ComputePEPValuesForAllPSMs() { - string[] trainingVariables = PsmData.trainingInfos[searchType]; - - //ensure that the order is always stable. - psms = psms.OrderByDescending(p => p).ToList(); - List allPeptideIndices = new List(); - List peptides = psms - .GroupBy(b => b.FullSequence) - .Select(b => b.FirstOrDefault()).ToList(); - List countOfPeptidesInEachFile = peptides.GroupBy(b => b.FullFilePath).Select(b => b.Count()).ToList(); - bool allFilesContainPeptides = (countOfPeptidesInEachFile.Count == fileSpecificParameters.Count); //rare condition where each file has psms but some files don't have peptides. probably only happens in unit tests. - UsePeptideLevelQValueForTraining = true; - QValueCutoff = fileSpecificParameters.Select(t => t.fileSpecificParameters.QValueCutoffForPepCalculation).Min(); - - int chargeStateMode = 0; - Dictionary fileSpecificMedianFragmentMassErrors = new Dictionary(); - if (peptides.Count() > 100 && allFilesContainPeptides) + List peptideGroups = UsePeptideLevelQValueForTraining + ? PeptideMatchGroup.GroupByBaseSequence(AllPsms) + : PeptideMatchGroup.GroupByIndividualPsm(AllPsms); + + if(UsePeptideLevelQValueForTraining && (peptideGroups.Count(g => g.BestMatch.IsDecoy) < 4 || peptideGroups.Count(g => !g.BestMatch.IsDecoy) < 4)) { - foreach (var peptide in peptides) + peptideGroups = PeptideMatchGroup.GroupByIndividualPsm(AllPsms); + } + + int numGroups = 4; + List[] peptideGroupIndices = GetPeptideGroupIndices(peptideGroups, numGroups); + IEnumerable[] PSMDataGroups = new IEnumerable[numGroups]; + for (int i = 0; i < numGroups; i++) + { + PSMDataGroups[i] = CreatePsmData(SearchType, peptideGroups, peptideGroupIndices[i]); + + if(!PSMDataGroups[i].Any(p => p.Label) || !PSMDataGroups[i].Any(p => !p.Label)) { - allPeptideIndices.Add(psms.IndexOf(peptide)); + return "Posterior error probability analysis failed. This can occur for small data sets when some sample groups are missing positive or negative training examples."; } - chargeStateMode = GetChargeStateMode(peptides); - fileSpecificMedianFragmentMassErrors = GetFileSpecificMedianFragmentMassError(peptides); } - else + + MLContext mlContext = new MLContext(); + TransformerChain>>[] trainedModels = new TransformerChain>>[numGroups]; + + var trainer = mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfTrees: 400); + var pipeline = mlContext.Transforms.Concatenate("Features", TrainingVariables) + .Append(trainer); + + List allMetrics = new List(); + int sumOfAllAmbiguousPeptidesResolved = 0; + + for (int groupIndexNumber = 0; groupIndexNumber < numGroups; groupIndexNumber++) { - //there are too few psms to do any meaningful training if we used only peptides. So, we will train using psms instead. - UsePeptideLevelQValueForTraining = false; - allPeptideIndices = Enumerable.Range(0, psms.Count).ToList(); - chargeStateMode = GetChargeStateMode(psms); - fileSpecificMedianFragmentMassErrors = GetFileSpecificMedianFragmentMassError(psms); + List allGroupIndexes = Enumerable.Range(0, numGroups).ToList(); + allGroupIndexes.RemoveAt(groupIndexNumber); + + //concat doesn't work in a loop, therefore I had to hard code the concat to group 3 out of 4 lists. if the const int numGroups value is changed, then the concat has to be changed accordingly. + IDataView dataView = mlContext.Data.LoadFromEnumerable(PSMDataGroups[allGroupIndexes[0]].Concat(PSMDataGroups[allGroupIndexes[1]].Concat(PSMDataGroups[allGroupIndexes[2]]))); + trainedModels[groupIndexNumber] = pipeline.Fit(dataView); + var myPredictions = trainedModels[groupIndexNumber].Transform(mlContext.Data.LoadFromEnumerable(PSMDataGroups[groupIndexNumber])); + CalibratedBinaryClassificationMetrics metrics = mlContext.BinaryClassification.Evaluate(data: myPredictions, labelColumnName: "Label", scoreColumnName: "Score"); + + //Parallel operation of the following code requires the method to be stored and then read, once for each thread + //if not output directory is specified, the model cannot be stored, and we must force single-threaded operation + if (OutputFolder != null) + { + mlContext.Model.Save(trainedModels[groupIndexNumber], dataView.Schema, Path.Combine(OutputFolder, "model.zip")); + } + + //model is trained on peptides but here we can use that to compute PEP for all PSMs + int ambiguousPeptidesResolved = Compute_PSM_PEP(peptideGroups, peptideGroupIndices[groupIndexNumber], mlContext, trainedModels[groupIndexNumber], SearchType, OutputFolder); + + allMetrics.Add(metrics); + sumOfAllAmbiguousPeptidesResolved += ambiguousPeptidesResolved; } - - //These two dictionaries contain the average and standard deviations of hydrophobicitys measured in 1 minute increments accross each raw - //file separately. An individully measured hydrobophicty calculated for a specific PSM sequence is compared to these values by computing - //the z-score. That z-score is used as a feature for machine learning. - //Separate dictionaries are created for peptides with modifications because SSRcalc doesn't really do a good job predicting hyrophobicity + return AggregateMetricsForOutput(allMetrics, sumOfAllAmbiguousPeptidesResolved); + } - //The first string in the dictionary is the filename - //The value of the dictionary is another dictionary that profiles the hydrophobicity behavior. - //Each key is a retention time rounded to the nearest minute. - //The value Tuple is the average and standard deviation, respectively, of the predicted hydrophobicities of the observed peptides eluting at that rounded retention time. + /// + /// Sets the following static properties: ChargeStateMode, FileSpecificMedianFragmentMassErrors, FileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, FileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, and FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE + /// + /// The PSMs that will be used for training + /// An array of training variables from PsmData.trainingInfos dictionary + public void BuildFileSpecificDictionaries(List trainingData, string[] trainingVariables) + { + FileSpecificMedianFragmentMassErrors = GetFileSpecificMedianFragmentMassError(trainingData); + ChargeStateMode = GetChargeStateMode(trainingData); if (trainingVariables.Contains("HydrophobicityZScore")) { - if (peptides.Count() > 100 && allFilesContainPeptides) + FileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified = ComputeHydrophobicityValues(trainingData, false); + FileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = ComputeHydrophobicityValues(trainingData, true); + FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE = ComputeMobilityValues(trainingData); + } + } + + public static List[] GetPeptideGroupIndices(List peptides, int numGroups) + { + List[] groupsOfIndices = new List[numGroups]; + + List targetIndices = new List(); + List decoyIndices = new List(); + for (int i = 0; i < peptides.Count; i++) + { + if (peptides[i].BestMatch.IsDecoy) { - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified = ComputeHydrophobicityValues(peptides, fileSpecificParameters, false); - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = ComputeHydrophobicityValues(peptides, fileSpecificParameters, true); - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE = ComputeMobilityValues(peptides, fileSpecificParameters); + decoyIndices.Add(i); } else { - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified = ComputeHydrophobicityValues(psms, fileSpecificParameters, false); - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = ComputeHydrophobicityValues(psms, fileSpecificParameters, true); - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE = ComputeMobilityValues(psms, fileSpecificParameters); + targetIndices.Add(i); } } - if (trainingVariables.Contains("ChimeraCount")) - chimeraCountDictionary = psms.GroupBy(p => p.ChimeraIdString) - .ToDictionary(p => p.Key, p => p.Count()); - - MLContext mlContext = new MLContext(); + var targetIndexGroups = DivideListIntoGroups(targetIndices, numGroups); + var decoyIndexGroups = DivideListIntoGroups(decoyIndices, numGroups); - //the number of groups used for cross-validation is hard-coded at four. Do not change this number without changes other areas of effected code. - int numGroups = 4; - if (psms.Count < 1000 || allPeptideIndices.Count < 500) + for (int i = 0; i < numGroups; i++) { - numGroups = 2; + groupsOfIndices[i] = targetIndexGroups[i].Concat(decoyIndexGroups[i]).ToList(); } - List[] psmGroupIndices = Get_PSM_Group_Indices(psms, numGroups); - //the psms will be randomly divided. but then we want to make another array that just contains the subset of peptides that are in those psms. that way we don't compute pep using any peptides that were used in training. - List[] peptideGroupIndices = Get_Peptide_Group_Indices(psmGroupIndices, allPeptideIndices); - IEnumerable[] PSMDataGroups = new IEnumerable[numGroups]; - + return groupsOfIndices; + } + + /// + /// This takes in a list of ints, and partitions them into numGroups partitions, + /// e.g., partition 1 = [0, 4, 8...], partition 2 = [1, 5, 9...], etc. + /// + /// A list containing numGroups partitions (lists of ints) + static List> DivideListIntoGroups(List list, int numGroups) + { + var groups = new List>(); for (int i = 0; i < numGroups; i++) { - PSMDataGroups[i] = CreatePsmData(searchType, fileSpecificParameters, psms, peptideGroupIndices[i], fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode); + groups.Add(new List()); } - TransformerChain>>[] trainedModels = new TransformerChain>>[numGroups]; - - var trainer = mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfTrees: 400); - var pipeline = mlContext.Transforms.Concatenate("Features", trainingVariables) - .Append(trainer); - - List allMetrics = new List(); - int sumOfAllAmbiguousPeptidesResolved = 0; - - bool allSetsContainPositiveAndNegativeTrainingExamples = true; - int groupNumber = 0; - while (allSetsContainPositiveAndNegativeTrainingExamples == true && groupNumber < numGroups) + int mainIndex = 0; + while (mainIndex < list.Count) { - if (PSMDataGroups[groupNumber].Where(p => p.Label == true).Count() == 0 || PSMDataGroups[groupNumber].Where(p => p.Label == false).Count() == 0) + int subIndex = 0; + while (subIndex < numGroups && mainIndex < list.Count) { - allSetsContainPositiveAndNegativeTrainingExamples = false; + groups[subIndex].Add(list[mainIndex]); + + subIndex++; + mainIndex++; } - groupNumber++; } - if (allSetsContainPositiveAndNegativeTrainingExamples) - { - for (int groupIndexNumber = 0; groupIndexNumber < numGroups; groupIndexNumber++) - { - List allGroupIndexes = Enumerable.Range(0, numGroups).ToList(); - allGroupIndexes.RemoveAt(groupIndexNumber); + return groups; + } - //concat doesn't work in a loop, therefore I had to hard code the concat to group 3 out of 4 lists. if the const int numGroups value is changed, then the concat has to be changed accordingly. - IDataView dataView = mlContext.Data.LoadFromEnumerable(PSMDataGroups[allGroupIndexes[0]]); - if (numGroups > 2) - { - dataView = mlContext.Data.LoadFromEnumerable(PSMDataGroups[allGroupIndexes[0]].Concat(PSMDataGroups[allGroupIndexes[1]].Concat(PSMDataGroups[allGroupIndexes[2]]))); - } - trainedModels[groupIndexNumber] = pipeline.Fit(dataView); - var myPredictions = trainedModels[groupIndexNumber].Transform(mlContext.Data.LoadFromEnumerable(PSMDataGroups[groupIndexNumber])); - CalibratedBinaryClassificationMetrics metrics = mlContext.BinaryClassification.Evaluate(data: myPredictions, labelColumnName: "Label", scoreColumnName: "Score"); + public IEnumerable CreatePsmData(string searchType, + List peptideGroups, List peptideGroupIndices) + { + object psmDataListLock = new object(); + List psmDataList = new List(); + List psmOrder = new List(); + int maxThreads = FileSpecificParametersDictionary.Values.FirstOrDefault().MaxThreadsToUsePerFile; + int[] threads = Enumerable.Range(0, maxThreads).ToArray(); - //Parallel operation of the following code requires the method to be stored and then read, once for each thread - //if not output directory is specified, the model cannot be stored, and we must force single-threaded operation - if (outputFolder != null) + Parallel.ForEach(Partitioner.Create(0, peptideGroupIndices.Count), + new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, + (range, loopState) => + { + List localPsmDataList = new List(); + List localPsmOrder = new List(); + for (int i = range.Item1; i < range.Item2; i++) { - mlContext.Model.Save(trainedModels[groupIndexNumber], dataView.Schema, Path.Combine(outputFolder, "model.zip")); - } + // Stop loop if canceled + if (GlobalVariables.StopLoops) { return; } - //model is trained on peptides but here we can use that to compute PEP for all PSMs - int ambiguousPeptidesResolved = Compute_PSM_PEP(psms, psmGroupIndices[groupIndexNumber], mlContext, trainedModels[groupIndexNumber], searchType, fileSpecificParameters, fileSpecificMedianFragmentMassErrors, chargeStateMode, outputFolder); + int modCount = 0; + foreach (var psm in peptideGroups[peptideGroupIndices[i]].GetBestMatchByMod().Where(psm => psm != null)) + { + PsmData newPsmData = new PsmData(); + if (searchType == "crosslink" && ((CrosslinkSpectralMatch)psm)?.BetaPeptide != null) + { + CrosslinkSpectralMatch csm = (CrosslinkSpectralMatch)psm; - allMetrics.Add(metrics); - sumOfAllAmbiguousPeptidesResolved += ambiguousPeptidesResolved; - } + bool label; + if (csm.IsDecoy || csm.BetaPeptide.IsDecoy) + { + label = false; + newPsmData = CreateOnePsmDataEntry(searchType, csm, csm.BestMatchingBioPolymersWithSetMods.First().Peptide, 0, label); + } + else if (!csm.IsDecoy && !csm.BetaPeptide.IsDecoy && csm.GetFdrInfo(UsePeptideLevelQValueForTraining).QValue <= QValueCutoff) + { + label = true; + newPsmData = CreateOnePsmDataEntry(searchType, csm, csm.BestMatchingBioPolymersWithSetMods.First().Peptide, 0, label); + } + else + { + continue; + } + localPsmDataList.Add(newPsmData); + } + else + { + double bmp = 0; + foreach (var (notch, peptideWithSetMods) in psm.BestMatchingBioPolymersWithSetMods) + { + bool label; + double bmpc = psm.BestMatchingBioPolymersWithSetMods.Count(); + if (peptideWithSetMods.Parent.IsDecoy) + { + label = false; + newPsmData = CreateOnePsmDataEntry(searchType, psm, + peptideWithSetMods, notch, label); + } + else if (!peptideWithSetMods.Parent.IsDecoy + && psm.GetFdrInfo(UsePeptideLevelQValueForTraining).QValue <= QValueCutoff) + { + label = true; + newPsmData = CreateOnePsmDataEntry(searchType, psm, + peptideWithSetMods, notch, label); + } + else + { + continue; + } + localPsmDataList.Add(newPsmData); + localPsmOrder.Add(i + (bmp / bmpc / 2.0)); + bmp += 1.0; + } + } + modCount++; + } + } + lock (psmDataListLock) + { + psmDataList.AddRange(localPsmDataList); + psmOrder.AddRange(localPsmOrder); + } + }); + PsmData[] pda = psmDataList.ToArray(); + double[] order = psmOrder.ToArray(); - return AggregateMetricsForOutput(allMetrics, sumOfAllAmbiguousPeptidesResolved); - } - else - { - return "Posterior error probability analysis failed. This can occur for small data sets when some sample groups are missing positive or negative training examples."; - } - } + Array.Sort(order, pda);//this sorts both arrays thru sorting the array in position one. The order array, keeps track of the positon in the original psms list and returns the PsmData array in that same order. - private static List[] Get_Peptide_Group_Indices(List[] psmGroupIndices, List allPeptideIndices) - { - List[] peptideGroupIndices = new List[psmGroupIndices.Length]; - for (int i = 0; i < psmGroupIndices.Length; i++) - { - peptideGroupIndices[i] = psmGroupIndices[i].Intersect(allPeptideIndices).ToList(); - } - return peptideGroupIndices; + return pda.AsEnumerable(); } public static string AggregateMetricsForOutput(List allMetrics, int sumOfAllAmbiguousPeptidesResolved) @@ -250,9 +364,11 @@ public static string AggregateMetricsForOutput(List psms, List psmIndices, MLContext mLContext, TransformerChain>> trainedModel, string searchType, List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters, Dictionary fileSpecificMedianFragmentMassErrors, int chargeStateMode, string outputFolder) + public int Compute_PSM_PEP(List peptideGroups, + List peptideGroupIndices, + MLContext mLContext, TransformerChain>> trainedModel, string searchType, string outputFolder) { - int maxThreads = fileSpecificParameters.FirstOrDefault().fileSpecificParameters.MaxThreadsToUsePerFile; + int maxThreads = FileSpecificParametersDictionary.Values.FirstOrDefault().MaxThreadsToUsePerFile; object lockObject = new object(); int ambiguousPeptidesResolved = 0; @@ -263,7 +379,7 @@ public static int Compute_PSM_PEP(List psms, List psmIndices maxThreads = 1; } - Parallel.ForEach(Partitioner.Create(0, psmIndices.Count), + Parallel.ForEach(Partitioner.Create(0, peptideGroupIndices.Count), new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, (range, loopState) => { @@ -287,32 +403,38 @@ public static int Compute_PSM_PEP(List psms, List psmIndices for (int i = range.Item1; i < range.Item2; i++) { - SpectralMatch psm = psms[psmIndices[i]]; - - if (psm != null) + foreach (SpectralMatch psm in peptideGroups[peptideGroupIndices[i]]) { - List indiciesOfPeptidesToRemove = new List(); - List pepValuePredictions = new List(); + // I'm not sure what's going one here vis-a-vis disambiguations, but I'm not going to touch it for now + if (psm != null) + { + List indiciesOfPeptidesToRemove = new List(); + List pepValuePredictions = new List(); - //Here we compute the pepvalue predection for each ambiguous peptide in a PSM. Ambiguous peptides with lower pepvalue predictions are removed from the PSM. + //Here we compute the pepvalue predection for each ambiguous peptide in a PSM. Ambiguous peptides with lower pepvalue predictions are removed from the PSM. - List allBmpNotches = new List(); - List allBmpPeptides = new List(); + List allBmpNotches = new List(); + List allBmpPeptides = new List(); - foreach (var (Notch, Peptide) in psm.BestMatchingBioPolymersWithSetMods) - { - allBmpNotches.Add(Notch); - allBmpPeptides.Add(Peptide); - PsmData pd = CreateOnePsmDataEntry(searchType, fileSpecificParameters, psm, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode, Peptide, Notch, !Peptide.Parent.IsDecoy); - var pepValuePrediction = threadPredictionEngine.Predict(pd); - pepValuePredictions.Add(pepValuePrediction.Probability); - //A score is available using the variable pepvaluePrediction.Score - } + foreach (var (Notch, Peptide) in psm.BestMatchingBioPolymersWithSetMods) + { + allBmpNotches.Add(Notch); + allBmpPeptides.Add(Peptide); + PsmData pd = CreateOnePsmDataEntry(searchType, psm, Peptide, Notch, !Peptide.Parent.IsDecoy); + var pepValuePrediction = threadPredictionEngine.Predict(pd); + pepValuePredictions.Add(pepValuePrediction.Probability); + //A score is available using the variable pepvaluePrediction.Score + } - GetIndiciesOfPeptidesToRemove(indiciesOfPeptidesToRemove, pepValuePredictions); - int peptidesRemoved = 0; - RemoveBestMatchingPeptidesWithLowPEP(psm, indiciesOfPeptidesToRemove, allBmpNotches, allBmpPeptides, pepValuePredictions, ref peptidesRemoved); - ambigousPeptidesRemovedinThread += peptidesRemoved; + GetIndiciesOfPeptidesToRemove(indiciesOfPeptidesToRemove, pepValuePredictions); + int peptidesRemoved = 0; + RemoveBestMatchingPeptidesWithLowPEP(psm, indiciesOfPeptidesToRemove, allBmpNotches, allBmpPeptides, pepValuePredictions, ref peptidesRemoved); + ambigousPeptidesRemovedinThread += peptidesRemoved; + + psm.PsmFdrInfo.PEP = 1 - pepValuePredictions.Max(); + psm.PeptideFdrInfo.PEP = 1 - pepValuePredictions.Max(); + } + } } @@ -324,57 +446,192 @@ public static int Compute_PSM_PEP(List psms, List psmIndices return ambiguousPeptidesResolved; } - public static List[] Get_PSM_Group_Indices(List psms, int numGroups) + public PsmData CreateOnePsmDataEntry(string searchType, SpectralMatch psm, IBioPolymerWithSetMods selectedPeptide, int notchToUse, bool label) { - List[] groupsOfIndicies = new List[numGroups]; - var targetIndexes = psms.Select((item, index) => new { Item = item, Index = index }) - .Where(x => !x.Item.IsDecoy) - .Select(x => x.Index) - .ToList(); - RandomizeListInPlace(targetIndexes); - var decoyIndexes = psms.Select((item, index) => new { Item = item, Index = index }) - .Where(x => x.Item.IsDecoy) - .Select(x => x.Index) - .ToList(); - RandomizeListInPlace(decoyIndexes); - - var targetGroups = DivideListIntoGroups(targetIndexes, numGroups); - var decoyGroups = DivideListIntoGroups(decoyIndexes, numGroups); + double normalizationFactor = selectedPeptide.BaseSequence.Length; + float totalMatchingFragmentCount = 0; + float internalMatchingFragmentCount = 0; + float intensity = 0; + float chargeDifference = 0; + float deltaScore = 0; + int notch = 0; + float ambiguity = 0; + float modCount = 0; + float absoluteFragmentMassError = 0; + float spectralAngle = 0; + float hasSpectralAngle = 0; + float chimeraCount = 0; + float peaksInPrecursorEnvelope = 0; + float mostAbundantPrecursorPeakIntensity = 0; + float fractionalIntensity = 0; - for (int i = 0; i < numGroups; i++) + float missedCleavages = 0; + float longestSeq = 0; + float complementaryIonCount = 0; + float hydrophobicityZscore = float.NaN; + bool isVariantPeptide = false; + + //crosslink specific features + float alphaIntensity = 0; + float betaIntensity = 0; + float longestFragmentIonSeries_Alpha = 0; + float longestFragmentIonSeries_Beta = 0; + float isDeadEnd = 0; + float isLoop = 0; + float isInter = 0; + float isIntra = 0; + + double multiplier = 10; + if (searchType != "crosslink") { - groupsOfIndicies[i] = targetGroups[i].Concat(decoyGroups[i]).ToList(); - } + if (searchType == "top-down") + { + normalizationFactor = 1.0; + } + // count only terminal fragment ions + totalMatchingFragmentCount = (float)(Math.Round(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide].Count(p => p.NeutralTheoreticalProduct.SecondaryProductType == null) / normalizationFactor * multiplier, 0)); + internalMatchingFragmentCount = (float)(Math.Round(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide].Count(p => p.NeutralTheoreticalProduct.SecondaryProductType != null) / normalizationFactor * multiplier, 0)); + intensity = (float)Math.Min(50, Math.Round((psm.Score - (int)psm.Score) / normalizationFactor * Math.Pow(multiplier, 2), 0)); + chargeDifference = -Math.Abs(ChargeStateMode - psm.ScanPrecursorCharge); + deltaScore = (float)Math.Round(psm.DeltaScore / normalizationFactor * multiplier, 0); + notch = notchToUse; + modCount = Math.Min((float)selectedPeptide.AllModsOneIsNterminus.Keys.Count(), 10); + if (psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide]?.Count() > 0) + { + absoluteFragmentMassError = (float)Math.Min(100.0, Math.Round(10.0 * Math.Abs(GetAverageFragmentMassError(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide]) - FileSpecificMedianFragmentMassErrors[Path.GetFileName(psm.FullFilePath)]))); + } - return groupsOfIndicies; - } + ambiguity = Math.Min((float)(psm.BioPolymersWithSetModsToMatchingFragments.Keys.Count - 1), 10); + //ambiguity = 10; // I'm pretty sure that you shouldn't train on ambiguity and its skewing the results + longestSeq = (float)Math.Round(SpectralMatch.GetLongestIonSeriesBidirectional(psm.BioPolymersWithSetModsToMatchingFragments, selectedPeptide) / normalizationFactor * multiplier, 0); + complementaryIonCount = (float)Math.Round(SpectralMatch.GetCountComplementaryIons(psm.BioPolymersWithSetModsToMatchingFragments, selectedPeptide) / normalizationFactor * multiplier, 0); + isVariantPeptide = PeptideIsVariant(selectedPeptide); + spectralAngle = (float)psm.SpectralAngle; + if (chimeraCountDictionary.TryGetValue(psm.ChimeraIdString, out int val)) + chimeraCount = val; + peaksInPrecursorEnvelope = psm.PrecursorScanEnvelopePeakCount; + mostAbundantPrecursorPeakIntensity = (float)Math.Round((float)psm.PrecursorScanIntensity / normalizationFactor * multiplier, 0); + fractionalIntensity = (float)psm.PrecursorFractionalIntensity; - static void RandomizeListInPlace(List list) - { - Random rng = new Random(42); - int n = list.Count; - while (n > 1) + if (PsmHasSpectralAngle(psm)) + { + hasSpectralAngle = 1; + } + + if (psm.DigestionParams.Protease.Name != "top-down") + { + missedCleavages = selectedPeptide.MissedCleavages; + bool fileIsCzeSeparationType = FileSpecificParametersDictionary.ContainsKey(Path.GetFileName(psm.FullFilePath)) && FileSpecificParametersDictionary[Path.GetFileName(psm.FullFilePath)].SeparationType == "CZE"; + + if (!fileIsCzeSeparationType) + { + if (selectedPeptide.BaseSequence.Equals(selectedPeptide.FullSequence)) + { + hydrophobicityZscore = (float)Math.Round(GetSSRCalcHydrophobicityZScore(psm, selectedPeptide, FileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified) * 10.0, 0); + } + else + { + hydrophobicityZscore = (float)Math.Round(GetSSRCalcHydrophobicityZScore(psm, selectedPeptide, FileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified) * 10.0, 0); + } + } + else + { + hydrophobicityZscore = (float)Math.Round(GetMobilityZScore(psm, selectedPeptide) * 10.0, 0); + } + } + //this is not for actual crosslinks but for the byproducts of crosslink loop links, deadends, etc. + if (psm is CrosslinkSpectralMatch) + { + CrosslinkSpectralMatch csm = (CrosslinkSpectralMatch)psm; + isDeadEnd = Convert.ToSingle((csm.CrossType == PsmCrossType.DeadEnd) || (csm.CrossType == PsmCrossType.DeadEndH2O) || (csm.CrossType == PsmCrossType.DeadEndNH2) || (csm.CrossType == PsmCrossType.DeadEndTris)); + isLoop = Convert.ToSingle(csm.CrossType == PsmCrossType.Loop); + } + } + else { - n--; - int k = rng.Next(n + 1); - T value = list[k]; - list[k] = list[n]; - list[n] = value; + CrosslinkSpectralMatch csm = (CrosslinkSpectralMatch)psm; + PeptideWithSetModifications selectedAlphaPeptide = csm.BestMatchingBioPolymersWithSetMods.Select(p => p.Peptide as PeptideWithSetModifications).First(); + PeptideWithSetModifications selectedBetaPeptide = csm.BetaPeptide?.BestMatchingBioPolymersWithSetMods.Select(p => p.Peptide as PeptideWithSetModifications).First(); + + float alphaNormalizationFactor = selectedAlphaPeptide.BaseSequence.Length; + float betaNormalizationFactor = selectedBetaPeptide == null ? (float)0 : selectedBetaPeptide.BaseSequence.Length; + float totalNormalizationFactor = alphaNormalizationFactor + betaNormalizationFactor; + + totalMatchingFragmentCount = (float)Math.Round(csm.XLTotalScore / totalNormalizationFactor * 10, 0); + + //Compute fragment mass error + int alphaCount = 0; + float alphaError = 0; + if (csm.BioPolymersWithSetModsToMatchingFragments[selectedAlphaPeptide]?.Count > 0) + { + alphaCount = csm.BioPolymersWithSetModsToMatchingFragments[selectedAlphaPeptide].Count; + alphaError = Math.Abs(GetAverageFragmentMassError(csm.BioPolymersWithSetModsToMatchingFragments[selectedAlphaPeptide])); + } + int betaCount = 0; + float betaError = 0; + if (selectedBetaPeptide != null && csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments[selectedBetaPeptide]?.Count > 0) + { + betaCount = csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments[selectedBetaPeptide].Count; + betaError = Math.Abs(GetAverageFragmentMassError(csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments[selectedBetaPeptide])); + } + + float averageError = 0; + if ((alphaCount + betaCount) > 0) + { + averageError = (alphaCount * alphaError + betaCount * betaError) / (alphaCount + betaCount); + } + + absoluteFragmentMassError = (float)Math.Min(100, Math.Round(averageError - FileSpecificMedianFragmentMassErrors[Path.GetFileName(csm.FullFilePath)] * 10.0, 0)); + //End compute fragment mass error + + deltaScore = (float)Math.Round(csm.DeltaScore / totalNormalizationFactor * 10.0, 0); + chargeDifference = -Math.Abs(ChargeStateMode - psm.ScanPrecursorCharge); + alphaIntensity = (float)Math.Min(100, Math.Round((csm.Score - (int)csm.Score) / alphaNormalizationFactor * 100.0, 0)); + betaIntensity = csm.BetaPeptide == null ? (float)0 : (float)Math.Min(100.0, Math.Round((csm.BetaPeptide.Score - (int)csm.BetaPeptide.Score) / betaNormalizationFactor * 100.0, 0)); + longestFragmentIonSeries_Alpha = (float)Math.Round(SpectralMatch.GetLongestIonSeriesBidirectional(csm.BioPolymersWithSetModsToMatchingFragments, selectedAlphaPeptide) / alphaNormalizationFactor * 10.0, 0); + longestFragmentIonSeries_Beta = selectedBetaPeptide == null ? (float)0 : SpectralMatch.GetLongestIonSeriesBidirectional(csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments, selectedBetaPeptide) / betaNormalizationFactor; + longestFragmentIonSeries_Beta = (float)Math.Round(longestFragmentIonSeries_Beta * 10.0, 0); + isInter = Convert.ToSingle(csm.CrossType == PsmCrossType.Inter); + isIntra = Convert.ToSingle(csm.CrossType == PsmCrossType.Intra); } - } + psm.PsmData_forPEPandPercolator = new PsmData + { + TotalMatchingFragmentCount = totalMatchingFragmentCount, + Intensity = intensity, + PrecursorChargeDiffToMode = chargeDifference, + DeltaScore = deltaScore, + Notch = notch, + ModsCount = modCount, + AbsoluteAverageFragmentMassErrorFromMedian = absoluteFragmentMassError, + MissedCleavagesCount = missedCleavages, + Ambiguity = ambiguity, + LongestFragmentIonSeries = longestSeq, + ComplementaryIonCount = complementaryIonCount, + HydrophobicityZScore = hydrophobicityZscore, + IsVariantPeptide = Convert.ToSingle(isVariantPeptide), + + AlphaIntensity = alphaIntensity, + BetaIntensity = betaIntensity, + LongestFragmentIonSeries_Alpha = longestFragmentIonSeries_Alpha, + LongestFragmentIonSeries_Beta = longestFragmentIonSeries_Beta, + IsDeadEnd = isDeadEnd, + IsLoop = isLoop, + IsInter = isInter, + IsIntra = isIntra, - static List> DivideListIntoGroups(List list, int n) - { - var groups = new List>(); - int groupSize = (int)Math.Ceiling(list.Count / (double)n); + Label = label, - for (int i = 0; i < n; i++) - { - groups.Add(list.Skip(i * groupSize).Take(groupSize).ToList()); - } + SpectralAngle = spectralAngle, + HasSpectralAngle = hasSpectralAngle, + PeaksInPrecursorEnvelope = peaksInPrecursorEnvelope, + ChimeraCount = chimeraCount, + MostAbundantPrecursorPeakIntensity = mostAbundantPrecursorPeakIntensity, + PrecursorFractionalIntensity = fractionalIntensity, + InternalIonCount = internalMatchingFragmentCount, + }; - return groups; + return psm.PsmData_forPEPandPercolator; } public static void RemoveBestMatchingPeptidesWithLowPEP(SpectralMatch psm, List indiciesOfPeptidesToRemove, List notches, List pwsmList, List pepValuePredictions, ref int ambiguousPeptidesRemovedCount) @@ -384,8 +641,6 @@ public static void RemoveBestMatchingPeptidesWithLowPEP(SpectralMatch psm, List< psm.RemoveThisAmbiguousPeptide(notches[i], pwsmList[i]); ambiguousPeptidesRemovedCount++; } - psm.PsmFdrInfo.PEP = 1 - pepValuePredictions.Max(); - psm.PeptideFdrInfo.PEP = 1 - pepValuePredictions.Max(); } /// @@ -409,22 +664,24 @@ public static void GetIndiciesOfPeptidesToRemove(List indiciesOfPeptidesToR } } + #region Dictionary Builder Functions and Utilities + /// /// Here we're getting the most common charge state for precursors that are Targets with q<=0.01. - public static int GetChargeStateMode(List psms) + public int GetChargeStateMode(List psms) { - return psms.Where(p => p.IsDecoy != true && p.FdrInfo.QValue <= 0.01).Select(p => p.ScanPrecursorCharge).GroupBy(n => n).OrderByDescending(g => g.Count()).Select(g => g.Key).FirstOrDefault(); + return psms.Where(p => p.IsDecoy != true && p.GetFdrInfo(UsePeptideLevelQValueForTraining).QValue <= 0.01).Select(p => p.ScanPrecursorCharge).GroupBy(n => n).OrderByDescending(g => g.Count()).Select(g => g.Key).FirstOrDefault(); } - public static Dictionary>> ComputeHydrophobicityValues(List psms, List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters, bool computeHydrophobicitiesforModifiedPeptides) + public Dictionary>> ComputeHydrophobicityValues(List psms, bool computeHydrophobicitiesforModifiedPeptides) { SSRCalc3 calc = new SSRCalc3("SSRCalc 3.0 (300A)", SSRCalc3.Column.A300); //TODO change the tuple so the values have names Dictionary>> rtHydrophobicityAvgDev = new Dictionary>>(); - List filenames = fileSpecificParameters.Where(s => s.fileSpecificParameters.SeparationType == "HPLC").Select(s => Path.GetFileName(s.fileName)).ToList(); + List filenames = FileSpecificParametersDictionary.Select(kvp => Path.GetFileName(kvp.Key)).ToList(); filenames = filenames.Distinct().ToList(); @@ -522,11 +779,11 @@ public static Dictionary>> Compute return rtHydrophobicityAvgDev; } - public static Dictionary>> ComputeMobilityValues(List psms, List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters) + public Dictionary>> ComputeMobilityValues(List psms) { Dictionary>> rtMobilityAvgDev = new Dictionary>>(); - List filenames = fileSpecificParameters.Where(s => s.fileSpecificParameters.SeparationType == "CZE").Select(s => Path.GetFileName(s.fileName)).ToList(); + List filenames = FileSpecificParametersDictionary.Select(kvp => Path.GetFileName(kvp.Key)).ToList(); filenames = filenames.Distinct().ToList(); @@ -659,18 +916,18 @@ private static float GetSSRCalcHydrophobicityZScore(SpectralMatch psm, IBioPolym return (float)hydrophobicityZscore; } - private static float GetMobilityZScore(SpectralMatch psm, IBioPolymerWithSetMods selectedPeptide) + private float GetMobilityZScore(SpectralMatch psm, IBioPolymerWithSetMods selectedPeptide) { double mobilityZScore = double.NaN; - if (fileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE.ContainsKey(Path.GetFileName(psm.FullFilePath))) + if (FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE.ContainsKey(Path.GetFileName(psm.FullFilePath))) { int time = (int)(2 * Math.Round(psm.ScanRetentionTime / 2d, 0)); - if (fileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE[Path.GetFileName(psm.FullFilePath)].Keys.Contains(time)) + if (FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE[Path.GetFileName(psm.FullFilePath)].Keys.Contains(time)) { double predictedMobility = 100.0 * GetCifuentesMobility(selectedPeptide); - mobilityZScore = Math.Abs(fileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE[Path.GetFileName(psm.FullFilePath)][time].Item1 - predictedMobility) / fileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE[Path.GetFileName(psm.FullFilePath)][time].Item2; + mobilityZScore = Math.Abs(FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE[Path.GetFileName(psm.FullFilePath)][time].Item1 - predictedMobility) / FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE[Path.GetFileName(psm.FullFilePath)][time].Item2; } } @@ -683,287 +940,11 @@ private static float GetMobilityZScore(SpectralMatch psm, IBioPolymerWithSetMods return (float)mobilityZScore; } - public static IEnumerable CreatePsmData(string searchType, List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters, - List psms, List psmIndicies, - Dictionary>> timeDependantHydrophobicityAverageAndDeviation_unmodified, - Dictionary>> timeDependantHydrophobicityAverageAndDeviation_modified, - Dictionary fileSpecificMedianFragmentMassErrors, int chargeStateMode) - { - object psmDataListLock = new object(); - List psmDataList = new List(); - List psmOrder = new List(); - int maxThreads = fileSpecificParameters.FirstOrDefault().fileSpecificParameters.MaxThreadsToUsePerFile; - int[] threads = Enumerable.Range(0, maxThreads).ToArray(); - - Parallel.ForEach(Partitioner.Create(0, psmIndicies.Count), - new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, - (range, loopState) => - { - List localPsmDataList = new List(); - List localPsmOrder = new List(); - for (int i = range.Item1; i < range.Item2; i++) - { - SpectralMatch psm = psms[psmIndicies[i]]; - - // Stop loop if canceled - if (GlobalVariables.StopLoops) { return; } - - PsmData newPsmData = new PsmData(); - if (searchType == "crosslink") - { - CrosslinkSpectralMatch csm = (CrosslinkSpectralMatch)psms[i]; - - bool label; - if (csm.IsDecoy || csm.BetaPeptide.IsDecoy) - { - label = false; - newPsmData = CreateOnePsmDataEntry(searchType, fileSpecificParameters, psm, timeDependantHydrophobicityAverageAndDeviation_unmodified, timeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode, csm.BestMatchingBioPolymersWithSetMods.First().Peptide, 0, label); - } - else if (!csm.IsDecoy && !csm.BetaPeptide.IsDecoy && psm.GetFdrInfo(UsePeptideLevelQValueForTraining).QValue <= QValueCutoff) - { - label = true; - newPsmData = CreateOnePsmDataEntry(searchType, fileSpecificParameters, psm, timeDependantHydrophobicityAverageAndDeviation_unmodified, timeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode, csm.BestMatchingBioPolymersWithSetMods.First().Peptide, 0, label); - } - else - { - continue; - } - localPsmDataList.Add(newPsmData); - localPsmOrder.Add(i); - } - else - { - double bmp = 0; - foreach (var (notch, peptideWithSetMods) in psm.BestMatchingBioPolymersWithSetMods) - { - bool label; - double bmpc = psm.BestMatchingBioPolymersWithSetMods.Count(); - if (peptideWithSetMods.Parent.IsDecoy) - { - label = false; - newPsmData = CreateOnePsmDataEntry(searchType, fileSpecificParameters, psm, timeDependantHydrophobicityAverageAndDeviation_unmodified, timeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode, peptideWithSetMods, notch, label); - } - else if (!peptideWithSetMods.Parent.IsDecoy && psm.GetFdrInfo(UsePeptideLevelQValueForTraining).QValue <= QValueCutoff) - { - label = true; - newPsmData = CreateOnePsmDataEntry(searchType, fileSpecificParameters, psm, timeDependantHydrophobicityAverageAndDeviation_unmodified, timeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode, peptideWithSetMods, notch, label); - } - else - { - continue; - } - localPsmDataList.Add(newPsmData); - localPsmOrder.Add(i + (bmp / bmpc / 2.0)); - bmp += 1.0; - } - } - } - lock (psmDataListLock) - { - psmDataList.AddRange(localPsmDataList); - psmOrder.AddRange(localPsmOrder); - } - }); - PsmData[] pda = psmDataList.ToArray(); - double[] order = psmOrder.ToArray(); - - Array.Sort(order, pda);//this sorts both arrays thru sorting the array in position one. The order array, keeps track of the positon in the original psms list and returns the PsmData array in that same order. - - return pda.AsEnumerable(); - } - - public static PsmData CreateOnePsmDataEntry(string searchType, List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters, SpectralMatch psm, Dictionary>> timeDependantHydrophobicityAverageAndDeviation_unmodified, Dictionary>> timeDependantHydrophobicityAverageAndDeviation_modified, Dictionary fileSpecificMedianFragmentMassErrors, int chargeStateMode, IBioPolymerWithSetMods selectedPeptide, int notchToUse, bool label) - { - double normalizationFactor = selectedPeptide.BaseSequence.Length; - float totalMatchingFragmentCount = 0; - float internalMatchingFragmentCount = 0; - float intensity = 0; - float chargeDifference = 0; - float deltaScore = 0; - int notch = 0; - float ambiguity = 0; - float modCount = 0; - float absoluteFragmentMassError = 0; - float spectralAngle = 0; - float hasSpectralAngle = 0; - float chimeraCount = 0; - float peaksInPrecursorEnvelope = 0; - float mostAbundantPrecursorPeakIntensity = 0; - float fractionalIntensity = 0; - - float missedCleavages = 0; - float longestSeq = 0; - float complementaryIonCount = 0; - float hydrophobicityZscore = float.NaN; - bool isVariantPeptide = false; - - //crosslink specific features - float alphaIntensity = 0; - float betaIntensity = 0; - float longestFragmentIonSeries_Alpha = 0; - float longestFragmentIonSeries_Beta = 0; - float isDeadEnd = 0; - float isLoop = 0; - float isInter = 0; - float isIntra = 0; - - double multiplier = 10; - if (searchType != "crosslink") - { - if (searchType == "top-down") - { - normalizationFactor = 1.0; - } - // count only terminal fragment ions - totalMatchingFragmentCount = (float)(Math.Round(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide].Count(p => p.NeutralTheoreticalProduct.SecondaryProductType == null) / normalizationFactor * multiplier, 0)); - internalMatchingFragmentCount = (float)(Math.Round(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide].Count(p => p.NeutralTheoreticalProduct.SecondaryProductType != null) / normalizationFactor * multiplier, 0)); - intensity = (float)Math.Min(50, Math.Round((psm.Score - (int)psm.Score) / normalizationFactor * Math.Pow(multiplier, 2), 0)); - chargeDifference = -Math.Abs(chargeStateMode - psm.ScanPrecursorCharge); - deltaScore = (float)Math.Round(psm.DeltaScore / normalizationFactor * multiplier, 0); - notch = notchToUse; - modCount = Math.Min((float)selectedPeptide.AllModsOneIsNterminus.Keys.Count(), 10); - if (psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide]?.Count() > 0) - { - absoluteFragmentMassError = (float)Math.Min(100.0, Math.Round(10.0 * Math.Abs(GetAverageFragmentMassError(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide]) - fileSpecificMedianFragmentMassErrors[Path.GetFileName(psm.FullFilePath)]))); - } - - ambiguity = Math.Min((float)(psm.BioPolymersWithSetModsToMatchingFragments.Keys.Count - 1), 10); - longestSeq = (float)Math.Round(SpectralMatch.GetLongestIonSeriesBidirectional(psm.BioPolymersWithSetModsToMatchingFragments, selectedPeptide) / normalizationFactor * multiplier, 0); - complementaryIonCount = (float)Math.Round(SpectralMatch.GetCountComplementaryIons(psm.BioPolymersWithSetModsToMatchingFragments, selectedPeptide) / normalizationFactor * multiplier, 0); - isVariantPeptide = PeptideIsVariant(selectedPeptide); - spectralAngle = (float)psm.SpectralAngle; - if (chimeraCountDictionary.TryGetValue(psm.ChimeraIdString, out int val)) - chimeraCount = val; - peaksInPrecursorEnvelope = psm.PrecursorScanEnvelopePeakCount; - mostAbundantPrecursorPeakIntensity = (float)Math.Round((float)psm.PrecursorScanIntensity / normalizationFactor * multiplier, 0); - fractionalIntensity = (float)psm.PrecursorFractionalIntensity; - - if (PsmHasSpectralAngle(psm)) - { - hasSpectralAngle = 1; - } - - if (psm.DigestionParams.Protease.Name != "top-down") - { - missedCleavages = selectedPeptide.MissedCleavages; - bool fileIsCzeSeparationType = fileSpecificParameters.Any(p => Path.GetFileName(p.fileName) == Path.GetFileName(psm.FullFilePath) && p.fileSpecificParameters.SeparationType == "CZE"); - - if (!fileIsCzeSeparationType) - { - if (selectedPeptide.BaseSequence.Equals(selectedPeptide.FullSequence)) - { - hydrophobicityZscore = (float)Math.Round(GetSSRCalcHydrophobicityZScore(psm, selectedPeptide, timeDependantHydrophobicityAverageAndDeviation_unmodified) * 10.0, 0); - } - else - { - hydrophobicityZscore = (float)Math.Round(GetSSRCalcHydrophobicityZScore(psm, selectedPeptide, timeDependantHydrophobicityAverageAndDeviation_modified) * 10.0, 0); - } - } - else - { - hydrophobicityZscore = (float)Math.Round(GetMobilityZScore(psm, selectedPeptide) * 10.0, 0); - } - } - //this is not for actual crosslinks but for the byproducts of crosslink loop links, deadends, etc. - if (psm is CrosslinkSpectralMatch) - { - CrosslinkSpectralMatch csm = (CrosslinkSpectralMatch)psm; - isDeadEnd = Convert.ToSingle((csm.CrossType == PsmCrossType.DeadEnd) || (csm.CrossType == PsmCrossType.DeadEndH2O) || (csm.CrossType == PsmCrossType.DeadEndNH2) || (csm.CrossType == PsmCrossType.DeadEndTris)); - isLoop = Convert.ToSingle(csm.CrossType == PsmCrossType.Loop); - } - } - else - { - CrosslinkSpectralMatch csm = (CrosslinkSpectralMatch)psm; - PeptideWithSetModifications selectedAlphaPeptide = csm.BestMatchingBioPolymersWithSetMods.Select(p => p.Peptide as PeptideWithSetModifications).First(); - PeptideWithSetModifications selectedBetaPeptide = csm.BetaPeptide?.BestMatchingBioPolymersWithSetMods.Select(p => p.Peptide as PeptideWithSetModifications).First(); - - float alphaNormalizationFactor = selectedAlphaPeptide.BaseSequence.Length; - float betaNormalizationFactor = selectedBetaPeptide == null ? (float)0 : selectedBetaPeptide.BaseSequence.Length; - float totalNormalizationFactor = alphaNormalizationFactor + betaNormalizationFactor; - - totalMatchingFragmentCount = (float)Math.Round(csm.XLTotalScore / totalNormalizationFactor * 10, 0); - - //Compute fragment mass error - int alphaCount = 0; - float alphaError = 0; - if (csm.BioPolymersWithSetModsToMatchingFragments[selectedAlphaPeptide]?.Count > 0) - { - alphaCount = csm.BioPolymersWithSetModsToMatchingFragments[selectedAlphaPeptide].Count; - alphaError = Math.Abs(GetAverageFragmentMassError(csm.BioPolymersWithSetModsToMatchingFragments[selectedAlphaPeptide])); - } - int betaCount = 0; - float betaError = 0; - if (csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments[selectedBetaPeptide]?.Count > 0) - { - betaCount = csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments[selectedBetaPeptide].Count; - betaError = Math.Abs(GetAverageFragmentMassError(csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments[selectedBetaPeptide])); - } - - float averageError = 0; - if ((alphaCount + betaCount) > 0) - { - averageError = (alphaCount * alphaError + betaCount * betaError) / (alphaCount + betaCount); - } - - absoluteFragmentMassError = (float)Math.Min(100, Math.Round(averageError - fileSpecificMedianFragmentMassErrors[Path.GetFileName(csm.FullFilePath)] * 10.0, 0)); - //End compute fragment mass error - - deltaScore = (float)Math.Round(csm.DeltaScore / totalNormalizationFactor * 10.0, 0); - chargeDifference = -Math.Abs(chargeStateMode - psm.ScanPrecursorCharge); - alphaIntensity = (float)Math.Min(100, Math.Round((csm.Score - (int)csm.Score) / alphaNormalizationFactor * 100.0, 0)); - betaIntensity = csm.BetaPeptide == null ? (float)0 : (float)Math.Min(100.0, Math.Round((csm.BetaPeptide.Score - (int)csm.BetaPeptide.Score) / betaNormalizationFactor * 100.0, 0)); - longestFragmentIonSeries_Alpha = (float)Math.Round(SpectralMatch.GetLongestIonSeriesBidirectional(csm.BioPolymersWithSetModsToMatchingFragments, selectedAlphaPeptide) / alphaNormalizationFactor * 10.0, 0); - longestFragmentIonSeries_Beta = selectedBetaPeptide == null ? (float)0 : SpectralMatch.GetLongestIonSeriesBidirectional(csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments, selectedBetaPeptide) / betaNormalizationFactor; - longestFragmentIonSeries_Beta = (float)Math.Round(longestFragmentIonSeries_Beta * 10.0, 0); - isInter = Convert.ToSingle(csm.CrossType == PsmCrossType.Inter); - isIntra = Convert.ToSingle(csm.CrossType == PsmCrossType.Intra); - } - - psm.PsmData_forPEPandPercolator = new PsmData - { - TotalMatchingFragmentCount = totalMatchingFragmentCount, - Intensity = intensity, - PrecursorChargeDiffToMode = chargeDifference, - DeltaScore = deltaScore, - Notch = notch, - ModsCount = modCount, - AbsoluteAverageFragmentMassErrorFromMedian = absoluteFragmentMassError, - MissedCleavagesCount = missedCleavages, - Ambiguity = ambiguity, - LongestFragmentIonSeries = longestSeq, - ComplementaryIonCount = complementaryIonCount, - HydrophobicityZScore = hydrophobicityZscore, - IsVariantPeptide = Convert.ToSingle(isVariantPeptide), - - AlphaIntensity = alphaIntensity, - BetaIntensity = betaIntensity, - LongestFragmentIonSeries_Alpha = longestFragmentIonSeries_Alpha, - LongestFragmentIonSeries_Beta = longestFragmentIonSeries_Beta, - IsDeadEnd = isDeadEnd, - IsLoop = isLoop, - IsInter = isInter, - IsIntra = isIntra, - - Label = label, - - SpectralAngle = spectralAngle, - HasSpectralAngle = hasSpectralAngle, - PeaksInPrecursorEnvelope = peaksInPrecursorEnvelope, - ChimeraCount = chimeraCount, - MostAbundantPrecursorPeakIntensity = mostAbundantPrecursorPeakIntensity, - PrecursorFractionalIntensity = fractionalIntensity, - InternalIonCount = internalMatchingFragmentCount, - }; - - return psm.PsmData_forPEPandPercolator; - } - private static bool PeptideIsVariant(IBioPolymerWithSetMods bpwsm) { - if (bpwsm is not PeptideWithSetModifications pwsm) + if (bpwsm is not PeptideWithSetModifications pwsm) return false; - + bool identifiedVariant = false; if (pwsm.Protein.AppliedSequenceVariations.Count() > 0) { @@ -984,7 +965,7 @@ private static bool PsmHasSpectralAngle(SpectralMatch psm) return psm.SpectralAngle >= 0; } - public static bool ContainsModificationsThatShiftMobility(IEnumerable modifications) + public static bool ContainsModificationsThatShiftMobility(IEnumerable modifications) { List shiftingModifications = new List { "Acetylation", "Ammonia loss", "Carbamyl", "Deamidation", "Formylation", "N2-acetylarginine", "N6-acetyllysine", "N-acetylalanine", "N-acetylaspartate", "N-acetylcysteine", "N-acetylglutamate", "N-acetylglycine", @@ -1053,5 +1034,7 @@ public static float GetAverageFragmentMassError(IEnumerable return massErrors.Average(); } + + #endregion } } \ No newline at end of file diff --git a/MetaMorpheus/EngineLayer/FdrAnalysis/PeptideMatchGroup.cs b/MetaMorpheus/EngineLayer/FdrAnalysis/PeptideMatchGroup.cs new file mode 100644 index 000000000..b88faa9d1 --- /dev/null +++ b/MetaMorpheus/EngineLayer/FdrAnalysis/PeptideMatchGroup.cs @@ -0,0 +1,70 @@ +using Omics; +using Proteomics.ProteolyticDigestion; +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace EngineLayer +{ + public class PeptideMatchGroup : IEnumerable + { + public string PeptideFullSequence { get; } + public List SpectralMatches { get; } + + /// + /// This class groups all spectral matches associated with a given peptide together, + /// to facilitate the calculation of PEP values. + /// + /// The full sequence to be used for grouping + /// Every spectral match that matches the full sequence + public PeptideMatchGroup(string fullPeptideSeq, List spectralMatches) + { + PeptideFullSequence = fullPeptideSeq; + SpectralMatches = spectralMatches; + } + + public static List GroupByBaseSequence(List spectralMatches) + { + // This groups psms by base sequence, ensuring that PSMs with the same base sequence but different modifications are grouped together when training. + + // TODO: Determine if it's better to group PSMs by base sequence or by full sequence. + return spectralMatches.GroupBy(p => p.BaseSequence) + .Select(group => new PeptideMatchGroup(group.Key, group.ToList())) + .OrderByDescending(matchGroup => matchGroup.Count()) + .ThenByDescending(matchGroup => matchGroup.BestMatch.Score) + .ToList(); + } + + public IEnumerable GetBestMatchByMod() + { + return SpectralMatches.GroupBy(p => p.FullSequence).Select(g => g.MaxBy(p => p)); + } + + /// + /// This function is called if there aren't enough peptides to train at the peptide level + /// + /// + /// + public static List GroupByIndividualPsm(List spectralMatches) + { + return spectralMatches.Select(psm => new PeptideMatchGroup(psm.FullSequence, new List { psm })) + .ToList(); + } + + public SpectralMatch BestMatch => SpectralMatches.MaxBy(match => match); + + public IEnumerator GetEnumerator() + { + return SpectralMatches.GetEnumerator(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + + } +} \ No newline at end of file diff --git a/MetaMorpheus/EngineLayer/ProteinScoringAndFdr/FdrCategory.cs b/MetaMorpheus/EngineLayer/ProteinScoringAndFdr/FdrCategory.cs index 47ad22412..cabfde29f 100644 --- a/MetaMorpheus/EngineLayer/ProteinScoringAndFdr/FdrCategory.cs +++ b/MetaMorpheus/EngineLayer/ProteinScoringAndFdr/FdrCategory.cs @@ -6,6 +6,25 @@ namespace EngineLayer { + /// + /// This enum is used to categorize the FDR of a peptide based on its cleavage specificity. + /// FullySpecific: The peptide is cleaved only at protease-specified cleavage sites. + /// SemiSpecific: The peptide is cleaved on one terminus at protease-specified cleavage sites and at non-specific site on the other terminus. + /// NonSpecific: The peptide is cleaved at non-specific sites on both termini. + /// + /// In the Speedy Non-Specific Search use case, all three categories are used with modern search. For each spectrum, the lowest q-value peptide is chosen + /// rather than the highest scoring peptide. + /// + /// In a classic NonSpecific search, I believe that only the NonSpecific category is used. Further, I believe that it includes peptides that are cleaved + /// at one or more protease-specified cleavage sites, but also at non-specific sites. + /// + /// The Single-N or Single-C protease is a special case. The modern search table is populated only with peptide fragments including the specified terminus. + /// Fragments from the other terminus are not included. + /// + /// This is not the same as Semi-Trypsin, which is a classic search where the protein is digested into peptides and then the database is further updated + /// the full set of peptides that could be generated by terminal degradation. + /// + /// public enum FdrCategory { //Cleavage Specificity diff --git a/MetaMorpheus/EngineLayer/SpectralMatch.cs b/MetaMorpheus/EngineLayer/SpectralMatch.cs index 96ef0f644..299f13b00 100644 --- a/MetaMorpheus/EngineLayer/SpectralMatch.cs +++ b/MetaMorpheus/EngineLayer/SpectralMatch.cs @@ -197,7 +197,7 @@ public void ResolveAllAmbiguities() ModsChemicalFormula = PsmTsvWriter.Resolve(_BestMatchingBioPolymersWithSetMods.Select(b => b.Pwsm.AllModsOneIsNterminus.Select(c => (c.Value)))).ResolvedValue; Notch = PsmTsvWriter.Resolve(_BestMatchingBioPolymersWithSetMods.Select(b => b.Notch)).ResolvedValue; - // if the PSM matches a target and a decoy and they are the SAME SEQUENCE, remove the decoy + //if the PSM matches a target and a decoy and they are the SAME SEQUENCE, remove the decoy if (IsDecoy) { bool removedPeptides = false; diff --git a/MetaMorpheus/TaskLayer/FilteredPsms.cs b/MetaMorpheus/TaskLayer/FilteredPsms.cs index 869a477a5..81e80c0fa 100644 --- a/MetaMorpheus/TaskLayer/FilteredPsms.cs +++ b/MetaMorpheus/TaskLayer/FilteredPsms.cs @@ -8,6 +8,12 @@ namespace TaskLayer { + public enum FilterType + { + QValue, + PepQValue + } + /// /// Contains a filtered list of PSMs. /// All properties within this class are read-only, and should only be set on object construction @@ -18,11 +24,11 @@ public class FilteredPsms : IEnumerable /// /// Filter type can have only two values: "q-value" or "pep q-value" /// - public string FilterType { get; init; } + public FilterType FilterType { get; init; } public double FilterThreshold { get; init; } public bool FilteringNotPerformed { get; init; } public bool PeptideLevelFiltering { get; init; } - public FilteredPsms(List filteredPsms, string filterType, double filterThreshold, bool filteringNotPerformed, bool peptideLevelFiltering) + public FilteredPsms(List filteredPsms, FilterType filterType, double filterThreshold, bool filteringNotPerformed, bool peptideLevelFiltering) { FilteredPsmsList = filteredPsms; FilterType = filterType; @@ -37,13 +43,18 @@ private bool AboveThreshold(SpectralMatch psm) switch (FilterType) { - case "pep q-value": + case FilterType.PepQValue: return psm.GetFdrInfo(PeptideLevelFiltering).PEP_QValue <= FilterThreshold; default: return psm.GetFdrInfo(PeptideLevelFiltering).QValue <= FilterThreshold && psm.GetFdrInfo(PeptideLevelFiltering).QValueNotch <= FilterThreshold; } } + public string GetFilterTypeString() + { + return FilterType == FilterType.PepQValue ? "pep q-value" : "q-value"; + } + /// /// This method should only be called when filtered PSMs are modified for the purpose of SILAC analysis /// @@ -87,7 +98,7 @@ public static FilteredPsms Filter(IEnumerable psms, List filteredPsms = new List(); // set the filter type - string filterType = "q-value"; + FilterType filterType = FilterType.QValue; if (pepQValueThreshold < qValueThreshold) { if (psms.Count() < 100) @@ -97,13 +108,13 @@ public static FilteredPsms Filter(IEnumerable psms, } else { - filterType = "pep q-value"; + filterType = FilterType.PepQValue; } } if (!includeHighQValuePsms) { - filteredPsms = filterType.Equals("q-value") + filteredPsms = filterType.Equals(FilterType.QValue) ? psms.Where(p => p.GetFdrInfo(filterAtPeptideLevel) != null && p.GetFdrInfo(filterAtPeptideLevel).QValue <= filterThreshold && p.GetFdrInfo(filterAtPeptideLevel).QValueNotch <= filterThreshold).ToList() diff --git a/MetaMorpheus/TaskLayer/MbrAnalysis/SpectralRecoveryRunner.cs b/MetaMorpheus/TaskLayer/MbrAnalysis/SpectralRecoveryRunner.cs index a963eefbf..eae34b3b8 100644 --- a/MetaMorpheus/TaskLayer/MbrAnalysis/SpectralRecoveryRunner.cs +++ b/MetaMorpheus/TaskLayer/MbrAnalysis/SpectralRecoveryRunner.cs @@ -119,9 +119,8 @@ public static SpectralRecoveryResults RunSpectralRecoveryAlgorithm( List allPsms = parameters.AllPsms. OrderByDescending(p => p).ToList(); - AssignEstimatedPsmQvalue(bestMbrMatches, allPsms); FDRAnalysisOfMbrPsms(bestMbrMatches, allPsms, parameters, fileSpecificParameters); - AssignEstimatedPsmPepQValue(bestMbrMatches, allPsms); + foreach (SpectralRecoveryPSM match in bestMbrMatches.Values) match.FindOriginalPsm(allPsms); } @@ -208,70 +207,10 @@ private static void FDRAnalysisOfMbrPsms(ConcurrentDictionary p.Value.spectralLibraryMatch). Where(v => v != null). ToList(); - List[] psmGroupIndices = PEP_Analysis_Cross_Validation.Get_PSM_Group_Indices(psms, 1); - MLContext mlContext = new MLContext(); - IEnumerable[] PSMDataGroups = new IEnumerable[1]; - - string searchType = "standard"; - if (psms[0].DigestionParams.Protease.Name == "top-down") - { - searchType = "top-down"; - } - - int chargeStateMode = PEP_Analysis_Cross_Validation.GetChargeStateMode(allPsms); - - Dictionary>> fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified = PEP_Analysis_Cross_Validation.ComputeHydrophobicityValues(allPsms, fileSpecificParameters, false); - Dictionary>> fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = PEP_Analysis_Cross_Validation.ComputeHydrophobicityValues(allPsms, fileSpecificParameters, true); - PEP_Analysis_Cross_Validation.ComputeMobilityValues(allPsms, fileSpecificParameters); - - Dictionary fileSpecificMedianFragmentMassErrors = PEP_Analysis_Cross_Validation.GetFileSpecificMedianFragmentMassError(allPsms); - - PSMDataGroups[0] = PEP_Analysis_Cross_Validation.CreatePsmData(searchType, fileSpecificParameters, psms, psmGroupIndices[0], fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode); - string[] trainingVariables = PsmData.trainingInfos[searchType]; - - TransformerChain>>[] trainedModels = new TransformerChain>>[1]; - - var trainer = mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfTrees: 400); - var pipeline = mlContext.Transforms.Concatenate("Features", trainingVariables).Append(trainer); - - IDataView dataView = mlContext.Data.LoadFromEnumerable(PSMDataGroups[0]); - - string outputFolder = parameters.OutputFolder; - - trainedModels[0] = pipeline.Fit(dataView); - - PEP_Analysis_Cross_Validation.Compute_PSM_PEP(psms, psmGroupIndices[0], mlContext, trainedModels[0], searchType, fileSpecificParameters, fileSpecificMedianFragmentMassErrors, chargeStateMode, outputFolder); - } + new FdrAnalysisEngine(psms, parameters.NumNotches, fileSpecificParameters.First().Item2, fileSpecificParameters, + new List { parameters.SearchTaskId }, analysisType: "PSM", doPEP: true, outputFolder: parameters.OutputFolder).Run(); - private static void AssignEstimatedPsmPepQValue(ConcurrentDictionary bestMbrMatches, List allPsms) - { - List pepValues = bestMbrMatches. - Select(p => p.Value.spectralLibraryMatch). - Where(p => p != null). - OrderBy(p => p.FdrInfo.PEP). - Select(p => p.FdrInfo.PEP). - ToList(); - - foreach (SpectralRecoveryPSM match in bestMbrMatches.Values) - { - if (match.spectralLibraryMatch == null) continue; - - int myIndex = 0; - while (myIndex < (pepValues.Count - 1) && pepValues[myIndex] <= match.spectralLibraryMatch.FdrInfo.PEP) - { - myIndex++; - } - if (myIndex == pepValues.Count - 1) - { - match.spectralLibraryMatch.FdrInfo.PEP_QValue = pepValues.Last(); - } - else - { - double estimatedQ = (pepValues[myIndex - 1] + pepValues[myIndex]) / 2; - match.spectralLibraryMatch.FdrInfo.PEP_QValue = estimatedQ; - } - } } private static void WriteSpectralRecoveryPsmResults(ConcurrentDictionary bestMbrMatches, PostSearchAnalysisParameters parameters) diff --git a/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs b/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs index 0a0f89e35..7def49d61 100644 --- a/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs +++ b/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs @@ -622,6 +622,46 @@ protected List LoadProteins(string taskId, List dbFilenameLi { Warn("Warning: " + emptyProteinEntries + " empty protein entries ignored"); } + + + + if (!proteinList.Any(p => p.IsDecoy)) + { + Status("Done loading proteins", new List { taskId }); + return proteinList; + } + + // Sanitize the decoys + // TODO: Fix this so that it accounts for multi-protease searches. Currently, we only consider the first protease + // when looking for target/decoy collisions + + HashSet targetPeptideSequences = new(); + foreach(var protein in proteinList.Where(p => !p.IsDecoy)) + { + // When thinking about decoy collisions, we can ignore modifications + foreach(var peptide in protein.Digest(commonParameters.DigestionParams, new List(), new List())) + { + targetPeptideSequences.Add(peptide.BaseSequence); + } + } + // Now, we iterate through the decoys and scramble the sequences that correspond to target peptides + for(int i = 0; i < proteinList.Count; i++) + { + if(proteinList[i].IsDecoy) + { + var peptidesToReplace = proteinList[i] + .Digest(commonParameters.DigestionParams, new List(), new List()) + .Select(p => p.BaseSequence) + .Where(targetPeptideSequences.Contains) + .ToList(); + if(peptidesToReplace.Any()) + { + proteinList[i] = Protein.ScrambleDecoyProteinSequence(proteinList[i], commonParameters.DigestionParams, forbiddenSequences: targetPeptideSequences, peptidesToReplace); + } + } + } + + Status("Done loading proteins", new List { taskId }); return proteinList; } diff --git a/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs b/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs index 41cb7692a..69e73a402 100644 --- a/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs +++ b/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs @@ -620,7 +620,7 @@ private void WritePsmResults() "PEP could not be calculated due to an insufficient number of PSMs. Results were filtered by q-value." + Environment.NewLine); } - string psmResultsText = "All target PSMs with " + psmsForPsmResults.FilterType + " <= " + Math.Round(psmsForPsmResults.FilterThreshold, 2) + ": " + + string psmResultsText = "All target PSMs with " + psmsForPsmResults.GetFilterTypeString() + " <= " + Math.Round(psmsForPsmResults.FilterThreshold, 2) + ": " + psmsForPsmResults.TargetPsmsAboveThreshold; ResultsDictionary[("All", "PSMs")] = psmResultsText; } @@ -647,7 +647,7 @@ private void WritePeptideResults() Parameters.SearchTaskResults.AddPsmPeptideProteinSummaryText( "PEP could not be calculated due to an insufficient number of PSMs. Results were filtered by q-value." + Environment.NewLine); } - string peptideResultsText = $"All target {GlobalVariables.AnalyteType.ToLower()}s with " + peptidesForPeptideResults.FilterType + " <= " + Math.Round(peptidesForPeptideResults.FilterThreshold, 2) + ": " + + string peptideResultsText = $"All target {GlobalVariables.AnalyteType.ToLower()}s with " + peptidesForPeptideResults.GetFilterTypeString() + " <= " + Math.Round(peptidesForPeptideResults.FilterThreshold, 2) + ": " + peptidesForPeptideResults.TargetPsmsAboveThreshold; ResultsDictionary[("All", GlobalVariables.AnalyteType)] = peptideResultsText; } @@ -684,7 +684,7 @@ private void WriteIndividualPsmResults() FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId, "Individual Spectra Files", psmFileGroup.Key }); // write summary text - string psmResultsText = strippedFileName + " - Target PSMs with " + psmsToWrite.FilterType + " <= " + Math.Round(psmsToWrite.FilterThreshold, 2) + ": " + + string psmResultsText = strippedFileName + " - Target PSMs with " + psmsToWrite.GetFilterTypeString() + " <= " + Math.Round(psmsToWrite.FilterThreshold, 2) + ": " + psmsToWrite.TargetPsmsAboveThreshold; ResultsDictionary[(strippedFileName, "PSMs")] = psmResultsText; } @@ -720,7 +720,7 @@ private void WriteIndividualPeptideResults() FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId, "Individual Spectra Files", psmFileGroup.Key }); // write summary text - string peptideResultsText = strippedFileName + $" - Target {GlobalVariables.AnalyteType.ToLower()}s with " + peptidesToWrite.FilterType + " <= " + Math.Round(peptidesToWrite.FilterThreshold, 2) + ": " + + string peptideResultsText = strippedFileName + $" - Target {GlobalVariables.AnalyteType.ToLower()}s with " + peptidesToWrite.GetFilterTypeString() + " <= " + Math.Round(peptidesToWrite.FilterThreshold, 2) + ": " + peptidesToWrite.TargetPsmsAboveThreshold; ResultsDictionary[(strippedFileName, GlobalVariables.AnalyteType)] = peptideResultsText; } @@ -746,7 +746,6 @@ private void UpdateSpectralLibrary() // Value is the highest scoring psm in the group elementSelector: g => g.MaxBy(p => p.Score)); - //load the original library var originalLibrarySpectra = Parameters.SpectralLibrary.GetAllLibrarySpectra(); List updatedLibrarySpectra = new(); diff --git a/MetaMorpheus/TaskLayer/SearchTask/SearchTask.cs b/MetaMorpheus/TaskLayer/SearchTask/SearchTask.cs index 5fb800513..6329d27a3 100644 --- a/MetaMorpheus/TaskLayer/SearchTask/SearchTask.cs +++ b/MetaMorpheus/TaskLayer/SearchTask/SearchTask.cs @@ -199,7 +199,7 @@ protected override MyTaskResults RunSpecific(string OutputFolder, List { taskId } ); Status("Searching files...", new List { taskId, "Individual Spectra Files" }); Dictionary numMs2SpectraPerFile = new Dictionary(); diff --git a/MetaMorpheus/Test/EverythingRunnerEngineTestCase.cs b/MetaMorpheus/Test/EverythingRunnerEngineTestCase.cs index 699b4fe67..a907b3f5d 100644 --- a/MetaMorpheus/Test/EverythingRunnerEngineTestCase.cs +++ b/MetaMorpheus/Test/EverythingRunnerEngineTestCase.cs @@ -160,8 +160,7 @@ static EverythingRunnerEngineTestCase() myTomlPath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\Task2-SearchTaskconfig.toml"); searchTaskLoaded = Toml.ReadFile(myTomlPath, MetaMorpheusTask.tomlConfig); - // TODO: Uncomment this line and change values for PR 2394 - //searchTaskLoaded.CommonParameters.QValueCutoffForPepCalculation = 0.01; + searchTaskLoaded.CommonParameters.QValueCutoffForPepCalculation = 0.01; _cases.Add(EverythingRunnerEngineTestCases.BottomUpPepQValue, new EverythingRunnerEngineTestCase(EverythingRunnerEngineTestCases.BottomUpPepQValue, new List<(string, MetaMorpheusTask)> { ("postSearchAnalysisTaskTestOutput", searchTaskLoaded) }, diff --git a/MetaMorpheus/Test/FdrTest.cs b/MetaMorpheus/Test/FdrTest.cs index 645d7dd60..73bb1af82 100644 --- a/MetaMorpheus/Test/FdrTest.cs +++ b/MetaMorpheus/Test/FdrTest.cs @@ -19,6 +19,11 @@ using TaskLayer; using UsefulProteomicsDatabases; using Omics; +using Org.BouncyCastle.Utilities.Collections; +using OxyPlot; +using static iText.Svg.SvgConstants; +using System.Reflection; +using UsefulProteomicsDatabases.Generated; namespace Test { @@ -32,18 +37,18 @@ public static void TestSeeModsThatShiftMobility() Modification am = new Modification(_originalId: "Ammonia loss"); List real = new List { ac, am }; - Assert.IsTrue(PEP_Analysis_Cross_Validation.ContainsModificationsThatShiftMobility(real)); - Assert.AreEqual(2, PEP_Analysis_Cross_Validation.CountModificationsThatShiftMobility(real)); + Assert.IsTrue(PepAnalysisEngine.ContainsModificationsThatShiftMobility(real)); + Assert.AreEqual(2, PepAnalysisEngine.CountModificationsThatShiftMobility(real)); Modification fac = new Modification(_originalId: "fake Acetylation"); Modification fam = new Modification(_originalId: "fake Ammonia loss"); List fake = new List { fac, fam }; - Assert.IsFalse(PEP_Analysis_Cross_Validation.ContainsModificationsThatShiftMobility(fake)); - Assert.AreEqual(0, PEP_Analysis_Cross_Validation.CountModificationsThatShiftMobility(fake)); + Assert.IsFalse(PepAnalysisEngine.ContainsModificationsThatShiftMobility(fake)); + Assert.AreEqual(0, PepAnalysisEngine.CountModificationsThatShiftMobility(fake)); - Assert.IsTrue(PEP_Analysis_Cross_Validation.ContainsModificationsThatShiftMobility(real.Concat(fake))); - Assert.AreEqual(2, PEP_Analysis_Cross_Validation.CountModificationsThatShiftMobility(real.Concat(fake))); + Assert.IsTrue(PepAnalysisEngine.ContainsModificationsThatShiftMobility(real.Concat(fake))); + Assert.AreEqual(2, PepAnalysisEngine.CountModificationsThatShiftMobility(real.Concat(fake))); } [Test] @@ -178,6 +183,7 @@ public static void TestComputePEPValue() Dictionary sequenceToPsmCount = new Dictionary(); + List sequences = new List(); foreach (SpectralMatch psm in nonNullPsms) { @@ -212,7 +218,31 @@ public static void TestComputePEPValue() { Path.GetFileName(maxScorePsm.FullFilePath), 0 } }; - var maxPsmData = PEP_Analysis_Cross_Validation.CreateOnePsmDataEntry("standard", fsp, maxScorePsm, fileSpecificRetTimeHI_behavior, fileSpecificRetTemHI_behaviorModifiedPeptides, massError, chargeStateMode, pwsm, notch, !pwsm.Parent.IsDecoy); + // Set values within PEP_Analysis through reflection + PepAnalysisEngine pepEngine = new PepAnalysisEngine(nonNullPsms, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + var pepEngineProperties = pepEngine.GetType().GetProperties(); + foreach (var p in pepEngineProperties) + { + switch(p.Name) + { + case "FileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified": + p.SetValue(pepEngine, fileSpecificRetTimeHI_behavior); + break; + case "FileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified": + p.SetValue(pepEngine, fileSpecificRetTimeHI_behavior); + break; + case "ChargeStateMode": + p.SetValue(pepEngine, chargeStateMode); + break; + case "FileSpecificMedianFragmentMassErrors": + p.SetValue(pepEngine, massError); + break; + default: + break; + } + } + + var maxPsmData = pepEngine.CreateOnePsmDataEntry("standard", maxScorePsm, pwsm, notch, !pwsm.Parent.IsDecoy); Assert.That(maxScorePsm.BioPolymersWithSetModsToMatchingFragments.Count - 1, Is.EqualTo(maxPsmData.Ambiguity)); double normalizationFactor = (double)pwsm.BaseSequence.Length; float maxPsmDeltaScore = (float)Math.Round(maxScorePsm.DeltaScore / normalizationFactor * 10.0, 0); @@ -230,7 +260,7 @@ public static void TestComputePEPValue() List psmCopyForPEPFailure = nonNullPsms.ToList(); List psmCopyForNoOutputFolder = nonNullPsms.ToList(); - PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(nonNullPsms, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + pepEngine.ComputePEPValuesForAllPSMs(); int trueCount = 0; @@ -253,7 +283,9 @@ public static void TestComputePEPValue() } } - string metrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(moreNonNullPSMs, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + + pepEngine = new PepAnalysisEngine(moreNonNullPSMs, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + string metrics = pepEngine.ComputePEPValuesForAllPSMs(); Assert.GreaterOrEqual(32, trueCount); //Test Variant Peptide as Input is identified as such as part of PEP calculation input much of the next several lines simply necessry to create a psm. @@ -286,18 +318,33 @@ public static void TestComputePEPValue() var (vnotch, vpwsm) = variantPSM.BestMatchingBioPolymersWithSetMods.First(); massError.Add(Path.GetFileName(variantPSM.FullFilePath), 0); - PsmData variantPsmData = PEP_Analysis_Cross_Validation.CreateOnePsmDataEntry("standard", fsp, variantPSM, fileSpecificRetTimeHI_behavior, fileSpecificRetTemHI_behaviorModifiedPeptides, massError, chargeStateMode, vpwsm, vnotch, !maxScorePsm.IsDecoy); + + // edit the FileSpecificMedianFragmentMassErrors property of PEP_Analysis_Cross_Validation to include the mass error for the variant peptide file + pepEngineProperties = pepEngine.GetType().GetProperties(); + foreach (var p in pepEngineProperties) + { + switch (p.Name) + { + case "FileSpecificMedianFragmentMassErrors": + p.SetValue(pepEngine, massError); + break; + default: + break; + } + } + + + PsmData variantPsmData = pepEngine.CreateOnePsmDataEntry("standard", variantPSM, vpwsm, vnotch, !maxScorePsm.IsDecoy); Assert.AreEqual((float)1, variantPsmData.IsVariantPeptide); //TEST CZE - fsp = new List<(string fileName, CommonParameters fileSpecificParameters)>(); var cp = new CommonParameters(separationType: "CZE"); fsp.Add((origDataFile, cp)); - PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psmCopyForCZETest, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + trueCount = 0; foreach (var item in psmCopyForCZETest.Where(p => p != null)) @@ -318,18 +365,22 @@ public static void TestComputePEPValue() moreNonNullPSMsCZE.Add(psm); } } - metrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(moreNonNullPSMsCZE, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + + pepEngine = new PepAnalysisEngine(moreNonNullPSMsCZE, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + metrics = pepEngine.ComputePEPValuesForAllPSMs(); Assert.GreaterOrEqual(32, trueCount); //TEST PEP calculation failure psmCopyForPEPFailure.RemoveAll(x => x.IsDecoy); - string result = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psmCopyForPEPFailure, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + pepEngine = new PepAnalysisEngine(psmCopyForPEPFailure, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + string result = pepEngine.ComputePEPValuesForAllPSMs(); Assert.AreEqual("Posterior error probability analysis failed. This can occur for small data sets when some sample groups are missing positive or negative training examples.", result); //Run PEP with no output folder; //There is no assertion here. We simply want to show that PEP calculation does not fail with null folder. string outputFolder = null; - string nullOutputFolderResults = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psmCopyForNoOutputFolder, "standard", fsp, outputFolder); + pepEngine = new PepAnalysisEngine(psmCopyForNoOutputFolder, "standard", fsp, outputFolder); + string nullOutputFolderResults = pepEngine.ComputePEPValuesForAllPSMs(); } [Test] @@ -404,7 +455,26 @@ public static void TestComputePEPValueTopDown() { { Path.GetFileName(maxScorePsm.FullFilePath), 0 } }; - var maxPsmData = PEP_Analysis_Cross_Validation.CreateOnePsmDataEntry("top-down", fsp, maxScorePsm, fileSpecificRetTimeHI_behavior, fileSpecificRetTemHI_behaviorModifiedPeptides, massError, chargeStateMode, pwsm, notch, !pwsm.Parent.IsDecoy); + + // Set values within PEP_Analysis through reflection + PepAnalysisEngine pepEngine = new PepAnalysisEngine(nonNullPsms, "top-down", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + var pepEngineProperties = pepEngine.GetType().GetProperties(); + foreach (var p in pepEngineProperties) + { + switch (p.Name) + { + case "ChargeStateMode": + p.SetValue(pepEngine, chargeStateMode); + break; + case "FileSpecificMedianFragmentMassErrors": + p.SetValue(pepEngine, massError); + break; + default: + break; + } + } + + var maxPsmData = pepEngine.CreateOnePsmDataEntry("top-down", maxScorePsm, pwsm, notch, !pwsm.Parent.IsDecoy); Assert.That(maxScorePsm.BioPolymersWithSetModsToMatchingFragments.Count - 1, Is.EqualTo(maxPsmData.Ambiguity)); double normalizationFactor = 1; float maxPsmDeltaScore = (float)Math.Round(maxScorePsm.DeltaScore / normalizationFactor * 10.0, 0); @@ -442,7 +512,7 @@ public static void TestPEP_peptideRemoval() List<(int notch, PeptideWithSetModifications pwsm)> bestMatchingPeptidesToRemove = new List<(int notch, PeptideWithSetModifications pwsm)>(); List pepValuePredictions = new List { 1.0d, 0.99d, 0.9d }; - PEP_Analysis_Cross_Validation.GetIndiciesOfPeptidesToRemove(indiciesOfPeptidesToRemove, pepValuePredictions); + PepAnalysisEngine.GetIndiciesOfPeptidesToRemove(indiciesOfPeptidesToRemove, pepValuePredictions); Assert.AreEqual(1, indiciesOfPeptidesToRemove.Count); Assert.AreEqual(2, indiciesOfPeptidesToRemove.FirstOrDefault()); Assert.AreEqual(2, pepValuePredictions.Count); @@ -455,7 +525,7 @@ public static void TestPEP_peptideRemoval() peptides.Add(bmp.Peptide); } - PEP_Analysis_Cross_Validation.RemoveBestMatchingPeptidesWithLowPEP(psm, indiciesOfPeptidesToRemove, notches, peptides, pepValuePredictions, ref ambiguousPeptidesRemovedCount); + PepAnalysisEngine.RemoveBestMatchingPeptidesWithLowPEP(psm, indiciesOfPeptidesToRemove, notches, peptides, pepValuePredictions, ref ambiguousPeptidesRemovedCount); Assert.AreEqual(1, ambiguousPeptidesRemovedCount); Assert.AreEqual(2, psm.BestMatchingBioPolymersWithSetMods.Select(b => b.Notch).ToList().Count); } @@ -472,13 +542,13 @@ public static void TestPEP_standardDeviationsToChange() averagesCommaStandardDeviations.Add(2, new Tuple(1.0d, 1.1d));//will NOT get removed becuase its perfectly fine averagesCommaStandardDeviations.Add(3, new Tuple(1.0d, 10.0d));//will get removed becuase its too big - PEP_Analysis_Cross_Validation.GetStDevsToChange(stDevsToChange, averagesCommaStandardDeviations, globalStDev); + PepAnalysisEngine.GetStDevsToChange(stDevsToChange, averagesCommaStandardDeviations, globalStDev); Assert.That(stDevsToChange.ContainsKey(0)); Assert.That(stDevsToChange.ContainsKey(1)); Assert.That(stDevsToChange.ContainsKey(3)); Assert.AreEqual(3, stDevsToChange.Keys.Count); - PEP_Analysis_Cross_Validation.UpdateOutOfRangeStDevsWithGlobalAverage(stDevsToChange, averagesCommaStandardDeviations); + PepAnalysisEngine.UpdateOutOfRangeStDevsWithGlobalAverage(stDevsToChange, averagesCommaStandardDeviations); Assert.AreEqual(1.0d, averagesCommaStandardDeviations[0].Item2); Assert.AreEqual(1.0d, averagesCommaStandardDeviations[1].Item2); diff --git a/MetaMorpheus/Test/PeptideSpectralMatchTest.cs b/MetaMorpheus/Test/PeptideSpectralMatchTest.cs index 9e6c9dcdf..a5f097ed5 100644 --- a/MetaMorpheus/Test/PeptideSpectralMatchTest.cs +++ b/MetaMorpheus/Test/PeptideSpectralMatchTest.cs @@ -33,11 +33,11 @@ public static void GetAminoAcidCoverageTest() int missedCleavages = 0; CleavageSpecificity cleavageSpecificity = CleavageSpecificity.Full; string peptideDescription = null; - string? pairedTargetDecoyHash = null; + string pairedTargetDecoySequence = null; PeptideWithSetModifications pwsmNoBaseSequence = new(sequence, allKnownMods, numFixedMods, digestionParams, myProtein, oneBasedStartResidueInProtein, oneBasedEndResidueInProtein, missedCleavages, cleavageSpecificity, - peptideDescription, pairedTargetDecoyHash); + peptideDescription, pairedTargetDecoySequence); PeptideSpectralMatch psmNoBaseSequenceNoMFI = new(pwsmNoBaseSequence, 0, 10, 0, ms2ScanOneMzTen, commonParams, new List()); psmNoBaseSequenceNoMFI.ResolveAllAmbiguities(); @@ -52,9 +52,10 @@ public static void GetAminoAcidCoverageTest() sequence = "PEPTIDE"; oneBasedEndResidueInProtein = Math.Max(sequence.Length, 0); myProtein = new Protein(sequence, "ACCESSION"); - PeptideWithSetModifications pwsmBaseSequence = new(sequence, allKnownMods, numFixedMods, digestionParams, myProtein, + var test = new PeptideWithSetModifications(sequence, allKnownMods); + PeptideWithSetModifications pwsmBaseSequence = new PeptideWithSetModifications(sequence, allKnownMods, numFixedMods, digestionParams, myProtein, oneBasedStartResidueInProtein, oneBasedEndResidueInProtein, missedCleavages, cleavageSpecificity, - peptideDescription, pairedTargetDecoyHash); + peptideDescription, pairedTargetDecoySequence); PeptideSpectralMatch psmBaseSequenceNoMFI = new(pwsmBaseSequence, 0, 10, 0, ms2ScanOneMzTen, commonParams, new List()); diff --git a/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs b/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs index 5426e1bbf..f01117297 100644 --- a/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs +++ b/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs @@ -78,27 +78,27 @@ public static void AllResultsAndResultsTxtContainsCorrectValues_PepQValue_Bottom string outputFolder = testCase.OutputDirectory; var allResultsFile = Path.Combine(outputFolder, "allResults.txt"); var allResults = File.ReadAllLines(allResultsFile); - Assert.AreEqual("All target PSMs with pep q-value <= 0.01: 420", allResults[10]); - Assert.AreEqual("All target peptides with pep q-value <= 0.01: 172", allResults[11]); - Assert.AreEqual("All target protein groups with q-value <= 0.01 (1% FDR): 155", allResults[12]); - Assert.AreEqual("TaGe_SA_A549_3_snip - Target PSMs with pep q-value <= 0.01: 210", allResults[14]); - Assert.AreEqual("TaGe_SA_A549_3_snip - Target peptides with pep q-value <= 0.01: 172", allResults[15]); - Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 155", allResults[16]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target PSMs with pep q-value <= 0.01: 210", allResults[18]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target peptides with pep q-value <= 0.01: 172", allResults[19]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 155", allResults[20]); + Assert.AreEqual("All target PSMs with pep q-value <= 0.01: 382", allResults[10]); + Assert.AreEqual("All target peptides with pep q-value <= 0.01: 153", allResults[11]); + Assert.AreEqual("All target protein groups with q-value <= 0.01 (1% FDR): 140", allResults[12]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target PSMs with pep q-value <= 0.01: 190", allResults[14]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target peptides with pep q-value <= 0.01: 153", allResults[15]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 140", allResults[16]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target PSMs with pep q-value <= 0.01: 190", allResults[18]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target peptides with pep q-value <= 0.01: 153", allResults[19]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 140", allResults[20]); var resultsFile = Path.Combine(outputFolder, @"postSearchAnalysisTaskTestOutput\results.txt"); var results = File.ReadAllLines(resultsFile); - Assert.AreEqual("All target PSMs with pep q-value <= 0.01: 420", results[5]); - Assert.AreEqual("All target peptides with pep q-value <= 0.01: 172", results[6]); - Assert.AreEqual("All target protein groups with q-value <= 0.01 (1% FDR): 155", results[7]); - Assert.AreEqual("TaGe_SA_A549_3_snip - Target PSMs with pep q-value <= 0.01: 210", results[9]); - Assert.AreEqual("TaGe_SA_A549_3_snip - Target peptides with pep q-value <= 0.01: 172", results[10]); - Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 155", results[11]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target PSMs with pep q-value <= 0.01: 210", results[13]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target peptides with pep q-value <= 0.01: 172", results[14]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 155", results[15]); + Assert.AreEqual("All target PSMs with pep q-value <= 0.01: 382", results[5]); + Assert.AreEqual("All target peptides with pep q-value <= 0.01: 153", results[6]); + Assert.AreEqual("All target protein groups with q-value <= 0.01 (1% FDR): 140", results[7]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target PSMs with pep q-value <= 0.01: 190", results[9]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target peptides with pep q-value <= 0.01: 153", results[10]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 140", results[11]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target PSMs with pep q-value <= 0.01: 190", results[13]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target peptides with pep q-value <= 0.01: 153", results[14]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 140", results[15]); } /// diff --git a/MetaMorpheus/Test/SearchEngineTests.cs b/MetaMorpheus/Test/SearchEngineTests.cs index 9053a1bb1..fc4d3c62f 100644 --- a/MetaMorpheus/Test/SearchEngineTests.cs +++ b/MetaMorpheus/Test/SearchEngineTests.cs @@ -74,6 +74,7 @@ public static void TestSearchEngineResultsPsmFromTsv() string myFile = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\TaGe_SA_A549_3_snip.mzML"); string myDatabase = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\TaGe_SA_A549_3_snip.fasta"); + searchTaskLoaded.CommonParameters.QValueCutoffForPepCalculation = 0.01; var engineToml = new EverythingRunnerEngine(new List<(string, MetaMorpheusTask)> { ("SearchTOML", searchTaskLoaded) }, new List { myFile }, new List { new DbForTask(myDatabase, false) }, outputFolder); engineToml.Run(); diff --git a/MetaMorpheus/Test/SpectralRecoveryTest.cs b/MetaMorpheus/Test/SpectralRecoveryTest.cs index 3257361c6..c69e03aa2 100644 --- a/MetaMorpheus/Test/SpectralRecoveryTest.cs +++ b/MetaMorpheus/Test/SpectralRecoveryTest.cs @@ -1,7 +1,8 @@ using EngineLayer; using EngineLayer.ClassicSearch; using MassSpectrometry; -using NUnit.Framework; using Assert = NUnit.Framework.Legacy.ClassicAssert; +using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Proteomics; using Proteomics.ProteolyticDigestion; using System; @@ -17,6 +18,8 @@ using UsefulProteomicsDatabases; using Nett; using System.DirectoryServices; +using System.Threading.Tasks; +using System.Threading; namespace Test { @@ -314,7 +317,7 @@ public static void SpectralWriterTest() QuantifyPpmTol = 25 } }, - CommonParameters = new CommonParameters(dissociationType: DissociationType.Autodetect), + CommonParameters = new CommonParameters(dissociationType: DissociationType.Autodetect, qValueCutoffForPepCalculation: 0.01), FileSpecificParameters = new List<(string FileName, CommonParameters Parameters)> { (rawSlices[0], new CommonParameters()), (rawSlices[1], new CommonParameters()) @@ -333,13 +336,16 @@ public static void SpectralWriterTest() testLibraryWithoutDecoy.CloseConnections(); + // Get rid of this file so it doesn't interfere with the next test + File.Delete(Path.Combine(path, matchingvalue)); + // new task with less than 100 psms. postSearchTask = new PostSearchAnalysisTask() { Parameters = new PostSearchAnalysisParameters() { ProteinList = proteinList, - AllPsms = psms.GetRange(0, 50), + AllPsms = psms.GetRange(0, 80), CurrentRawFileList = rawSlices, DatabaseFilenameList = databaseList, OutputFolder = outputFolder, @@ -360,32 +366,35 @@ public static void SpectralWriterTest() QuantifyPpmTol = 25 } }, - CommonParameters = new CommonParameters(dissociationType: DissociationType.Autodetect), + CommonParameters = new CommonParameters(dissociationType: DissociationType.Autodetect, qValueCutoffForPepCalculation: 0.01), FileSpecificParameters = new List<(string FileName, CommonParameters Parameters)> { (rawSlices[0], new CommonParameters()), (rawSlices[1], new CommonParameters()) } }; - postSearchTask.Run(); + // Find and open the new spectral library + list = Directory.GetFiles(path, "*.*", SearchOption.AllDirectories); + matchingvalue = list.Where(p => p.Contains("SpectralLibrary")).First().ToString(); testLibraryWithoutDecoy = new SpectralLibrary(new List { Path.Combine(path, matchingvalue) }); - Assert.That(testLibraryWithoutDecoy.TryGetSpectrum("EESGKPGAHVTVK", 2, out spectrum)); + // When writing a new spectral library, we don't want it to have the exact same name as the old one. + // So, we make sure at least one second has passed + Thread.Sleep(new TimeSpan(0, 0, 1)); // Wait for the library to close + // Test spectral library update postSearchTask.Parameters.SearchParameters.UpdateSpectralLibrary = true; postSearchTask.Parameters.SpectralLibrary = testLibraryWithoutDecoy; postSearchTask.Run(); - var libraryList = Directory.GetFiles(path, "*.*", SearchOption.AllDirectories); string updateLibraryPath = libraryList.First(p => p.Contains("updateSpectralLibrary") && !p.Contains(matchingvalue)).ToString(); var updatedLibraryWithoutDecoy = new SpectralLibrary(new List { Path.Combine(path, updateLibraryPath) }); Assert.That(updatedLibraryWithoutDecoy.TryGetSpectrum("EESGKPGAHVTVK", 2, out spectrum)); - testLibraryWithoutDecoy.CloseConnections(); + testLibraryWithoutDecoy.CloseConnections(); updatedLibraryWithoutDecoy.CloseConnections(); - } [Test] diff --git a/MetaMorpheus/Test/XLTest.cs b/MetaMorpheus/Test/XLTest.cs index 229189c3b..5de9a1e65 100644 --- a/MetaMorpheus/Test/XLTest.cs +++ b/MetaMorpheus/Test/XLTest.cs @@ -417,14 +417,14 @@ public static void XlTest_MoreComprehensive() } MyFileManager myFileManager = new MyFileManager(true); - CommonParameters CommonParameters = new CommonParameters(digestionParams: new DigestionParams(), maxThreadsToUsePerFile: 1); + CommonParameters commonParameters2 = new CommonParameters(digestionParams: new DigestionParams(), maxThreadsToUsePerFile: 1); var fsp = new List<(string fileName, CommonParameters fileSpecificParameters)>(); - fsp.Add((Path.GetFileName(newFileName), CommonParameters)); + fsp.Add((Path.GetFileName(newFileName), commonParameters2)); - var myMsDataFile = myFileManager.LoadFile(newFileName, CommonParameters); + var myMsDataFile = myFileManager.LoadFile(newFileName, commonParameters2); - Ms2ScanWithSpecificMass[] listOfSortedms2Scans = MetaMorpheusTask.GetMs2ScansWrapByScanNum(myMsDataFile, newFileName, CommonParameters, out List> precursorss).ToArray(); + Ms2ScanWithSpecificMass[] listOfSortedms2Scans = MetaMorpheusTask.GetMs2ScansWrapByScanNum(myMsDataFile, newFileName, commonParameters2, out List> precursorss).ToArray(); //Generate crosslinker, which is DSS here. Crosslinker crosslinker = GlobalVariables.Crosslinkers.Where(p => p.CrosslinkerName == "DSS").First(); @@ -529,9 +529,45 @@ public static void XlTest_MoreComprehensive() Assert.AreEqual(0, deadendTris); Assert.AreEqual(0, unnasignedCrossType); - var fdrResultsXLink = new FdrAnalysisEngine(firstCsmsFromListsOfCsms.Where(c => c.CrossType == PsmCrossType.Inter || c.CrossType == PsmCrossType.Intra).ToList(), 1, CommonParameters, fsp, new List(), "crosslink").Run(); + // We have pretty high peptide-level q values for crosslinks, so we need to up the cut-off is we want PEP to run + commonParameters2.QValueCutoffForPepCalculation = 0.05; + var fdrResultsXLink = new FdrAnalysisEngine(firstCsmsFromListsOfCsms.Where(c => c.CrossType == PsmCrossType.Inter || c.CrossType == PsmCrossType.Intra).ToList(), 1, commonParameters2, fsp, new List(), "crosslink").Run(); - fdrResultsXLink = new FdrAnalysisEngine(firstCsmsFromListsOfCsms.Where(c => c.CrossType != PsmCrossType.Inter && c.CrossType != PsmCrossType.Intra).ToList(), 1, CommonParameters, fsp, new List(), "standard").Run(); + unnasignedCrossType = 0; + inter = 0; + intra = 0; + single = 0; + loop = 0; + deadend = 0; + deadendH2O = 0; + deadendNH2 = 0; + deadendTris = 0; + + foreach (CrosslinkSpectralMatch csm in firstCsmsFromListsOfCsms.Where(c => (c.CrossType == PsmCrossType.Inter || c.CrossType == PsmCrossType.Intra) && c.FdrInfo.PEP_QValue <= 0.02).ToList()) + { + switch (csm.CrossType) + { + case PsmCrossType.Inter: + inter++; + break; + + case PsmCrossType.Intra: + intra++; + break; + + default: + unnasignedCrossType++; + break; + } + } + + Assert.AreEqual(47, inter); + Assert.AreEqual(73, intra); + Assert.AreEqual(0, unnasignedCrossType); + + + // We have pretty high peptide-level q values for crosslinks, so we need to up the cut-off is we want PEP to run + fdrResultsXLink = new FdrAnalysisEngine(firstCsmsFromListsOfCsms.Where(c => c.CrossType != PsmCrossType.Inter && c.CrossType != PsmCrossType.Intra).ToList(), 1, commonParameters2, fsp, new List(), "standard").Run(); unnasignedCrossType = 0; inter = 0; @@ -634,7 +670,26 @@ public static void XlTest_MoreComprehensive() { Path.GetFileName(intraCsm.FullFilePath), 0 } }; - var intraPsmData = PEP_Analysis_Cross_Validation.CreateOnePsmDataEntry("crosslink", fsp, intraCsm, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, medianFragmentMassError, chargeStateMode, intraCsm.BestMatchingBioPolymersWithSetMods.First().Peptide, intraCsm.BestMatchingBioPolymersWithSetMods.First().Notch, !intraCsm.BestMatchingBioPolymersWithSetMods.First().Peptide.Parent.IsDecoy); + // Set values within PEP_Analysis through reflection + + PepAnalysisEngine pepEngine = new PepAnalysisEngine(new List(firstCsmsFromListsOfCsms), "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + var pepEngineProperties = pepEngine.GetType().GetProperties(); + foreach (var p in pepEngineProperties) + { + switch (p.Name) + { + case "ChargeStateMode": + p.SetValue(pepEngine, chargeStateMode); + break; + case "FileSpecificMedianFragmentMassErrors": + p.SetValue(pepEngine, medianFragmentMassError); + break; + default: + break; + } + } + + var intraPsmData = pepEngine.CreateOnePsmDataEntry("crosslink", intraCsm, intraCsm.BestMatchingBioPolymersWithSetMods.First().Peptide, intraCsm.BestMatchingBioPolymersWithSetMods.First().Notch, !intraCsm.BestMatchingBioPolymersWithSetMods.First().Peptide.Parent.IsDecoy); Assert.That(intraPsmData.AbsoluteAverageFragmentMassErrorFromMedian, Is.EqualTo(1.0).Within(0.1)); Assert.That(intraPsmData.AlphaIntensity, Is.EqualTo(1).Within(0.1)); Assert.AreEqual(intraPsmData.Ambiguity, 0); @@ -659,16 +714,9 @@ public static void XlTest_MoreComprehensive() CrosslinkSpectralMatch singleCsm = firstCsmsFromListsOfCsms.Where(c => c.CrossType == PsmCrossType.Single).OrderBy(c => -c.Score).First(); - List psms = new List(); - psms.AddRange(firstCsmsFromListsOfCsms); - - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified = PEP_Analysis_Cross_Validation.ComputeHydrophobicityValues(psms, fsp, false); - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = PEP_Analysis_Cross_Validation.ComputeHydrophobicityValues(psms, fsp, true); - - var singleCsmPsmData = PEP_Analysis_Cross_Validation.CreateOnePsmDataEntry("standard", fsp, singleCsm, - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, medianFragmentMassError, - chargeStateMode, singleCsm.BestMatchingBioPolymersWithSetMods.FirstOrDefault().Peptide, + var singleCsmPsmData = pepEngine.CreateOnePsmDataEntry("standard", + singleCsm, + singleCsm.BestMatchingBioPolymersWithSetMods.FirstOrDefault().Peptide, singleCsm.BestMatchingBioPolymersWithSetMods.FirstOrDefault().Notch, !singleCsm.BestMatchingBioPolymersWithSetMods.FirstOrDefault().Peptide.Parent.IsDecoy); Assert.That(singleCsmPsmData.AbsoluteAverageFragmentMassErrorFromMedian, Is.EqualTo(8).Within(0.1)); @@ -695,7 +743,7 @@ public static void XlTest_MoreComprehensive() Assert.That(singleCsmPsmData.TotalMatchingFragmentCount, Is.EqualTo(8).Within(0.1)); CrosslinkSpectralMatch loopCsm = firstCsmsFromListsOfCsms.Where(c => c.CrossType == PsmCrossType.Loop).OrderBy(c => -c.Score).First(); - var loopCsmPsmData = PEP_Analysis_Cross_Validation.CreateOnePsmDataEntry("standard", fsp, loopCsm, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, medianFragmentMassError, chargeStateMode, loopCsm.BestMatchingBioPolymersWithSetMods.First().Peptide, loopCsm.BestMatchingBioPolymersWithSetMods.First().Notch, !loopCsm.BestMatchingBioPolymersWithSetMods.First().Peptide.Parent.IsDecoy); Assert.That(loopCsmPsmData.AbsoluteAverageFragmentMassErrorFromMedian, Is.EqualTo(6).Within(0.1)); + var loopCsmPsmData = pepEngine.CreateOnePsmDataEntry("standard", loopCsm, loopCsm.BestMatchingBioPolymersWithSetMods.First().Peptide, loopCsm.BestMatchingBioPolymersWithSetMods.First().Notch, !loopCsm.BestMatchingBioPolymersWithSetMods.First().Peptide.Parent.IsDecoy); Assert.That(loopCsmPsmData.AbsoluteAverageFragmentMassErrorFromMedian, Is.EqualTo(6).Within(0.1)); Assert.AreEqual(loopCsmPsmData.AlphaIntensity, 0); Assert.AreEqual(loopCsmPsmData.Ambiguity, 0); Assert.AreEqual(loopCsmPsmData.BetaIntensity, 0);