From 7353259d2acc40e209ae7f63f4442d69c324c0fb Mon Sep 17 00:00:00 2001 From: nbollis Date: Thu, 19 Sep 2024 15:04:20 -0500 Subject: [PATCH 01/17] Added in base classes --- mzLib/Chemistry/ClassExtensions.cs | 1 + .../Enums/DissociationType.cs | 5 + mzLib/MzLibUtil/ClassExtensions.cs | 2 +- .../Fragmentation/FragmentationTerminus.cs | 19 +- .../Oligo/DissociationTypeCollection.cs | 3 +- .../Digestion/NucleolyticOligo.cs | 168 +++++++++ .../Digestion/OligoWithSetMods.cs | 336 +++++++++++++++++ .../Digestion/RnaDigestionParams.cs | 51 +++ mzLib/Transcriptomics/Digestion/Rnase.cs | 60 ++- .../Interfaces/INucleicAcid.cs | 2 +- mzLib/Transcriptomics/NucleicAcid.cs | 356 ++++++++++++++++++ mzLib/Transcriptomics/RNA.cs | 51 +++ mzLib/mzLib.sln.DotSettings | 5 +- 13 files changed, 1037 insertions(+), 22 deletions(-) create mode 100644 mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs create mode 100644 mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs create mode 100644 mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs create mode 100644 mzLib/Transcriptomics/NucleicAcid.cs create mode 100644 mzLib/Transcriptomics/RNA.cs diff --git a/mzLib/Chemistry/ClassExtensions.cs b/mzLib/Chemistry/ClassExtensions.cs index 8bb5aecdc..7093e1f5f 100644 --- a/mzLib/Chemistry/ClassExtensions.cs +++ b/mzLib/Chemistry/ClassExtensions.cs @@ -48,6 +48,7 @@ public static double ToMass(this double massToChargeRatio, int charge) return Math.Abs(charge) * massToChargeRatio - charge * Constants.ProtonMass; } + public static double? RoundedDouble(this double myNumber, int places = 9) => RoundedDouble(myNumber as double?, places); public static double? RoundedDouble(this double? 
myNumber, int places = 9) { if (myNumber != null) diff --git a/mzLib/MassSpectrometry/Enums/DissociationType.cs b/mzLib/MassSpectrometry/Enums/DissociationType.cs index 1ac136197..ca738b3fa 100644 --- a/mzLib/MassSpectrometry/Enums/DissociationType.cs +++ b/mzLib/MassSpectrometry/Enums/DissociationType.cs @@ -109,6 +109,11 @@ public enum DissociationType /// LowCID, + /// + /// activated ion electron photo detachment dissociation + /// + aEPD, + Unknown, AnyActivationType, Custom, diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index 0129154a4..05e5cfd1e 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -19,6 +19,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Text; using System.Text.RegularExpressions; namespace MzLibUtil @@ -122,6 +123,5 @@ public static string GetPeriodTolerantFilenameWithoutExtension(this string fileP { return PeriodTolerantFilenameWithoutExtension.GetPeriodTolerantFilenameWithoutExtension(filePath); } - } } \ No newline at end of file diff --git a/mzLib/Omics/Fragmentation/FragmentationTerminus.cs b/mzLib/Omics/Fragmentation/FragmentationTerminus.cs index 146309caa..788041690 100644 --- a/mzLib/Omics/Fragmentation/FragmentationTerminus.cs +++ b/mzLib/Omics/Fragmentation/FragmentationTerminus.cs @@ -1,19 +1,12 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Omics.Fragmentation +namespace Omics.Fragmentation { public enum FragmentationTerminus - { - Both, //N- and C-terminus - N, //N-terminus only - C, //C-terminus only + { + Both, //N- and C-terminus + N, //N-terminus only + C, //C-terminus only None, //used for internal fragments, could be used for top down intact mass? 
FivePrime, // 5' for NucleicAcids ThreePrime, // 3' for NucleicAcids - } - + } } diff --git a/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs b/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs index d5b020160..b2b7cd891 100644 --- a/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs +++ b/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs @@ -1 +1,2 @@ -using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; using Chemistry; using MassSpectrometry; namespace Omics.Fragmentation.Oligo { /// /// Methods dealing with specific product type for RNA molecules /// public static class DissociationTypeCollection { /// /// Product Ion types by dissociation method /// private static readonly Dictionary> ProductsFromDissociationType = new Dictionary>() { { DissociationType.Unknown, new List() }, { DissociationType.CID, new List { ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M } }, { DissociationType.LowCID, new List() { } }, { DissociationType.IRMPD, new List() { } }, { DissociationType.ECD, new List { } }, { DissociationType.PQD, new List { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.yWaterLoss, ProductType.d, ProductType.M } }, { DissociationType.ETD, new List { } }, { DissociationType.HCD, new List { ProductType.w, ProductType.y, ProductType.aBaseLoss, ProductType.dWaterLoss, ProductType.M } }, { DissociationType.AnyActivationType, new List { } }, { DissociationType.EThcD, new List { } }, { DissociationType.Custom, new List { } }, { DissociationType.ISCID, new List { } } }; /// /// Returns list of products types based upon the dissociation type /// /// /// public static List GetRnaProductTypesFromDissociationType(this DissociationType dissociationType) => 
ProductsFromDissociationType[dissociationType]; /// /// Mass to be added or subtracted /// private static readonly Dictionary FragmentIonCaps = new Dictionary { { ProductType.a, ChemicalFormula.ParseFormula("H") }, { ProductType.aWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, { ProductType.b, ChemicalFormula.ParseFormula("OH") }, { ProductType.bWaterLoss, ChemicalFormula.ParseFormula("H-1") }, { ProductType.c, ChemicalFormula.ParseFormula("O3H2P") }, { ProductType.cWaterLoss, ChemicalFormula.ParseFormula("O2P") }, { ProductType.d, ChemicalFormula.ParseFormula("O4H2P") }, { ProductType.dWaterLoss, ChemicalFormula.ParseFormula("O3P") }, { ProductType.w, ChemicalFormula.ParseFormula("H") }, { ProductType.wWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, { ProductType.x, ChemicalFormula.ParseFormula("O-1H") }, { ProductType.xWaterLoss, ChemicalFormula.ParseFormula("O-2H-1") }, { ProductType.y, ChemicalFormula.ParseFormula("O-3P-1") }, { ProductType.yWaterLoss, ChemicalFormula.ParseFormula("O-4H-2P-1") }, { ProductType.z, ChemicalFormula.ParseFormula("O-4P-1") }, { ProductType.zWaterLoss, ChemicalFormula.ParseFormula("O-5H-2P-1") }, //fragment - Base chemical formula is the corresponding fragment chemical formula subtracing 1 H as H is lost when base is removed { ProductType.aBaseLoss, ChemicalFormula.ParseFormula("H-2") }, // "H-1" -H { ProductType.bBaseLoss, ChemicalFormula.ParseFormula("O1H-2") }, //"OH1" -H { ProductType.cBaseLoss, ChemicalFormula.ParseFormula("O3H-1P") }, //"O3P" -H { ProductType.dBaseLoss, ChemicalFormula.ParseFormula("O4H-1P") }, //"O4H2P" -H { ProductType.wBaseLoss, ChemicalFormula.ParseFormula("H-2") }, //"H"-H { ProductType.xBaseLoss, ChemicalFormula.ParseFormula("O-1H-2") }, //"O-1H" -H { ProductType.yBaseLoss, ChemicalFormula.ParseFormula("O-3H-2P-1") }, //"O-3P-1" -H { ProductType.zBaseLoss, ChemicalFormula.ParseFormula("O-4H-3P-1") }, //"O-4H-1P-1" -1 { ProductType.M, new ChemicalFormula() } }; /// /// Returns mass shift by 
product type /// /// /// public static double GetRnaMassShiftFromProductType(this ProductType type) => FragmentIonCaps[type].MonoisotopicMass; public static FragmentationTerminus GetRnaTerminusType(this ProductType fragmentType) { switch (fragmentType) { case ProductType.a: case ProductType.aWaterLoss: case ProductType.aBaseLoss: case ProductType.b: case ProductType.bWaterLoss: case ProductType.bBaseLoss: case ProductType.c: case ProductType.cWaterLoss: case ProductType.cBaseLoss: case ProductType.d: case ProductType.dWaterLoss: case ProductType.dBaseLoss: return FragmentationTerminus.FivePrime; case ProductType.w: case ProductType.wWaterLoss: case ProductType.wBaseLoss: case ProductType.x: case ProductType.xWaterLoss: case ProductType.xBaseLoss: case ProductType.y: case ProductType.yWaterLoss: case ProductType.yBaseLoss: case ProductType.z: case ProductType.zWaterLoss: case ProductType.zBaseLoss: return FragmentationTerminus.ThreePrime; case ProductType.M: return FragmentationTerminus.None; case ProductType.aStar: case ProductType.aDegree: case ProductType.bAmmoniaLoss: case ProductType.yAmmoniaLoss: case ProductType.zPlusOne: case ProductType.D: case ProductType.Ycore: case ProductType.Y: default: throw new ArgumentOutOfRangeException(nameof(fragmentType), fragmentType, null); } } /// /// Product ion types by Fragmentation Terminus /// private static readonly Dictionary> ProductIonTypesFromSpecifiedTerminus = new Dictionary> { { FragmentationTerminus.FivePrime, new List { ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, } }, { FragmentationTerminus.ThreePrime, new List { ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, ProductType.z, 
ProductType.zWaterLoss, ProductType.zBaseLoss, } }, { FragmentationTerminus.Both, new List { ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, ProductType.M } } }; public static List GetRnaTerminusSpecificProductTypes( this FragmentationTerminus fragmentationTerminus) { return ProductIonTypesFromSpecifiedTerminus[fragmentationTerminus]; } /// /// Returns all product ion types based upon specified terminus /// /// /// /// public static List GetRnaTerminusSpecificProductTypesFromDissociation( this DissociationType dissociationType, FragmentationTerminus fragmentationTerminus) { var terminusSpecific = fragmentationTerminus.GetRnaTerminusSpecificProductTypes(); var dissociationSpecific = dissociationType.GetRnaProductTypesFromDissociationType(); return terminusSpecific.Intersect(dissociationSpecific).ToList(); } } } \ No newline at end of file +using Chemistry; using MassSpectrometry; namespace Omics.Fragmentation.Oligo { /// /// Methods dealing with specific product type for RNA molecules /// public static class DissociationTypeCollection { + /// /// Product Ion types by dissociation method /// /// /// HCD ions were taken from the following paper: https://www.nature.com/articles/s41598-023-36193-2 /// Ion types below here should be validated with experimental results. /// Base and water losses occur very frequently and may also be present in these activation types. 
/// CID, UVPD, and aEPD ions were taken from the following paper: https://pubs.acs.org/doi/10.1021/acs.analchem.3c05428?ref=PDF /// NETD ions were taken from the following paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7161943/ /// lowCID ions were taken from this Thermo Poster: https://assets.thermofisher.com/TFS-Assets/CMD/Flyers/fl-489263-asms23-optimized-fragmentation-oligonucleotides-suppresses-undesired-fragmentation-fl489263-en.pdf /// public static Dictionary> ProductsFromDissociationType = new Dictionary>() { { DissociationType.Unknown, new List() }, { DissociationType.Custom, new List() }, { DissociationType.AnyActivationType, new List { ProductType.a, ProductType.aBaseLoss, ProductType.aWaterLoss, ProductType.b, ProductType.bBaseLoss, ProductType.bWaterLoss, ProductType.c, ProductType.cBaseLoss, ProductType.cWaterLoss, ProductType.d, ProductType.dBaseLoss, ProductType.dWaterLoss, ProductType.w, ProductType.wBaseLoss, ProductType.wWaterLoss, ProductType.x, ProductType.xBaseLoss, ProductType.xWaterLoss, ProductType.y, ProductType.yBaseLoss, ProductType.yWaterLoss, ProductType.z, ProductType.zBaseLoss, ProductType.zWaterLoss, ProductType.M } }, { DissociationType.CID, new List { ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M } }, { DissociationType.HCD, new List { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.z, ProductType.M } }, { DissociationType.UVPD, new List { ProductType.a, ProductType.c, ProductType.d, ProductType.w, ProductType.M } }, { DissociationType.aEPD, new List { ProductType.a, ProductType.c, ProductType.d, ProductType.w, ProductType.x, ProductType.z, ProductType.M } }, { DissociationType.NETD, new List { ProductType.w, ProductType.d, ProductType.M } }, { DissociationType.LowCID, new List() { ProductType.aBaseLoss, 
ProductType.c, ProductType.dWaterLoss, ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M } }, { DissociationType.IRMPD, new List() { } }, { DissociationType.ECD, new List { } }, { DissociationType.PQD, new List { } }, { DissociationType.ETD, new List { } }, { DissociationType.EThcD, new List { } }, }; /// /// Returns list of products types based upon the dissociation type /// /// /// public static List GetRnaProductTypesFromDissociationType(this DissociationType dissociationType) => ProductsFromDissociationType[dissociationType]; /// /// Mass to be added or subtracted /// private static readonly Dictionary FragmentIonCaps = new Dictionary { { ProductType.a, ChemicalFormula.ParseFormula("H") }, { ProductType.aWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, { ProductType.b, ChemicalFormula.ParseFormula("OH") }, { ProductType.bWaterLoss, ChemicalFormula.ParseFormula("H-1") }, { ProductType.c, ChemicalFormula.ParseFormula("O3H2P") }, { ProductType.cWaterLoss, ChemicalFormula.ParseFormula("O2P") }, { ProductType.d, ChemicalFormula.ParseFormula("O4H2P") }, { ProductType.dWaterLoss, ChemicalFormula.ParseFormula("O3P") }, { ProductType.w, ChemicalFormula.ParseFormula("H") }, { ProductType.wWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, { ProductType.x, ChemicalFormula.ParseFormula("O-1H") }, { ProductType.xWaterLoss, ChemicalFormula.ParseFormula("O-2H-1") }, { ProductType.y, ChemicalFormula.ParseFormula("O-3P-1") }, { ProductType.yWaterLoss, ChemicalFormula.ParseFormula("O-4H-2P-1") }, { ProductType.z, ChemicalFormula.ParseFormula("O-4P-1") }, { ProductType.zWaterLoss, ChemicalFormula.ParseFormula("O-5H-2P-1") }, //fragment - Base chemical formula is the corresponding fragment chemical formula subtracing 1 H as H is lost when base is removed { ProductType.aBaseLoss, ChemicalFormula.ParseFormula("H-2") }, // "H-1" -H { ProductType.bBaseLoss, ChemicalFormula.ParseFormula("O1H-2") }, //"OH1" -H { ProductType.cBaseLoss, 
ChemicalFormula.ParseFormula("O3H-1P") }, //"O3P" -H { ProductType.dBaseLoss, ChemicalFormula.ParseFormula("O4H-1P") }, //"O4H2P" -H { ProductType.wBaseLoss, ChemicalFormula.ParseFormula("H-2") }, //"H"-H { ProductType.xBaseLoss, ChemicalFormula.ParseFormula("O-1H-2") }, //"O-1H" -H { ProductType.yBaseLoss, ChemicalFormula.ParseFormula("O-3H-2P-1") }, //"O-3P-1" -H { ProductType.zBaseLoss, ChemicalFormula.ParseFormula("O-4H-3P-1") }, //"O-4H-1P-1" -1 { ProductType.M, new ChemicalFormula() } }; /// /// Returns mass shift by product type /// /// /// public static double GetRnaMassShiftFromProductType(this ProductType type) => FragmentIonCaps[type].MonoisotopicMass; public static FragmentationTerminus GetRnaTerminusType(this ProductType fragmentType) { switch (fragmentType) { case ProductType.a: case ProductType.aWaterLoss: case ProductType.aBaseLoss: case ProductType.b: case ProductType.bWaterLoss: case ProductType.bBaseLoss: case ProductType.c: case ProductType.cWaterLoss: case ProductType.cBaseLoss: case ProductType.d: case ProductType.dWaterLoss: case ProductType.dBaseLoss: return FragmentationTerminus.FivePrime; case ProductType.w: case ProductType.wWaterLoss: case ProductType.wBaseLoss: case ProductType.x: case ProductType.xWaterLoss: case ProductType.xBaseLoss: case ProductType.y: case ProductType.yWaterLoss: case ProductType.yBaseLoss: case ProductType.z: case ProductType.zWaterLoss: case ProductType.zBaseLoss: return FragmentationTerminus.ThreePrime; case ProductType.M: return FragmentationTerminus.None; case ProductType.aStar: case ProductType.aDegree: case ProductType.bAmmoniaLoss: case ProductType.yAmmoniaLoss: case ProductType.zPlusOne: case ProductType.D: case ProductType.Ycore: case ProductType.Y: default: throw new ArgumentOutOfRangeException(nameof(fragmentType), fragmentType, null); } } /// /// Product ion types by Fragmentation Terminus /// private static readonly Dictionary> ProductIonTypesFromSpecifiedTerminus = new Dictionary> { { 
FragmentationTerminus.FivePrime, new List { ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, } }, { FragmentationTerminus.ThreePrime, new List { ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, } }, { FragmentationTerminus.Both, new List { ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, ProductType.M } } }; public static List GetRnaTerminusSpecificProductTypes( this FragmentationTerminus fragmentationTerminus) { return ProductIonTypesFromSpecifiedTerminus[fragmentationTerminus]; } /// /// Returns all product ion types based upon specified terminus /// /// /// /// public static List GetRnaTerminusSpecificProductTypesFromDissociation( this DissociationType dissociationType, FragmentationTerminus fragmentationTerminus) { var terminusSpecific = fragmentationTerminus.GetRnaTerminusSpecificProductTypes(); var dissociationSpecific = dissociationType.GetRnaProductTypesFromDissociationType(); return terminusSpecific.Intersect(dissociationSpecific).ToList(); } } } \ No newline at end of file diff --git a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs new file mode 100644 index 
000000000..a741638c5 --- /dev/null +++ b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs @@ -0,0 +1,168 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Chemistry; +using Omics.Digestion; +using Omics.Modifications; + +namespace Transcriptomics.Digestion +{ + public class NucleolyticOligo : DigestionProduct + { + protected IHasChemicalFormula _fivePrimeTerminus; + protected IHasChemicalFormula _threePrimeTerminus; + + internal NucleolyticOligo(NucleicAcid nucleicAcid, int oneBaseStartResidue, + int oneBasedEndResidue, int missedCleavages, CleavageSpecificity cleavageSpecificity, + IHasChemicalFormula? fivePrimeTerminus, IHasChemicalFormula? threePrimeTerminus) + : base(nucleicAcid, oneBaseStartResidue, oneBasedEndResidue, missedCleavages, cleavageSpecificity) + { + _fivePrimeTerminus = fivePrimeTerminus ?? NucleicAcid.DefaultFivePrimeTerminus; + _threePrimeTerminus = threePrimeTerminus ?? NucleicAcid.DefaultThreePrimeTerminus; + } + + /// + /// Nucleic acid this oligo was digested from + /// + public NucleicAcid NucleicAcid + { + get => Parent as NucleicAcid; + protected set => Parent = value; + } + + public override string ToString() + { + return BaseSequence; + } + + internal IEnumerable GetModifiedOligos(IEnumerable allKnownFixedMods, + RnaDigestionParams digestionParams, List variableModifications) + { + int oligoLength = OneBasedEndResidue - OneBasedStartResidue + 1; + int maximumVariableModificationIsoforms = digestionParams.MaxModificationIsoforms; + int maxModsForOligo = digestionParams.MaxMods; + var twoBasedPossibleVariableAndLocalizeableModifications = new Dictionary>(oligoLength + 4); + + var fivePrimeVariableMods = new List(); + twoBasedPossibleVariableAndLocalizeableModifications.Add(1, fivePrimeVariableMods); + + var threePrimeVariableMods = new List(); + twoBasedPossibleVariableAndLocalizeableModifications.Add(oligoLength + 2, threePrimeVariableMods); + + // 
collect all possible variable mods, skipping if there is a database annotated modification + foreach (Modification variableModification in variableModifications) + { + // Check if can be a n-term mod + if (CanBeFivePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, 1, variableModification)) + { + fivePrimeVariableMods.Add(variableModification); + } + + for (int r = 0; r < oligoLength; r++) + { + if (variableModification.LocationRestriction == "Anywhere." && + ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, r + 1, oligoLength, OneBasedStartResidue + r) + && !ModificationLocalization.UniprotModExists(NucleicAcid, r + 1, variableModification)) + { + if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) + { + residueVariableMods = new List { variableModification }; + twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); + } + else + { + residueVariableMods.Add(variableModification); + } + } + } + // Check if can be a c-term mod + if (CanBeThreePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, oligoLength, variableModification)) + { + threePrimeVariableMods.Add(variableModification); + } + } + + // LOCALIZED MODS + foreach (var kvp in NucleicAcid.OneBasedPossibleLocalizedModifications) + { + bool inBounds = kvp.Key >= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; + if (!inBounds) + { + continue; + } + + int locInPeptide = kvp.Key - OneBasedStartResidue + 1; + foreach (Modification modWithMass in kvp.Value) + { + if (modWithMass is Modification variableModification) + { + // Check if can be a n-term mod + if (locInPeptide == 1 && CanBeFivePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) + { + fivePrimeVariableMods.Add(variableModification); + } + + int r = locInPeptide - 1; + if (r >= 0 && r < oligoLength + && (NucleicAcid.IsDecoy 
|| + (ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, r + 1, oligoLength, OneBasedStartResidue + r) + && variableModification.LocationRestriction == "Anywhere."))) + { + if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) + { + residueVariableMods = new List { variableModification }; + twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); + } + else + { + residueVariableMods.Add(variableModification); + } + } + + // Check if can be a c-term mod + if (locInPeptide == oligoLength && CanBeThreePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) + { + threePrimeVariableMods.Add(variableModification); + } + } + } + } + + int variable_modification_isoforms = 0; + + foreach (Dictionary kvp in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForOligo, oligoLength)) + { + int numFixedMods = 0; + foreach (var ok in GetFixedModsOneIsNorFivePrimeTerminus(oligoLength, allKnownFixedMods)) + { + if (!kvp.ContainsKey(ok.Key)) + { + numFixedMods++; + kvp.Add(ok.Key, ok.Value); + } + } + yield return new OligoWithSetMods(NucleicAcid, digestionParams, OneBasedStartResidue, OneBasedEndResidue, MissedCleavages, + CleavageSpecificityForFdrCategory, kvp, numFixedMods, _fivePrimeTerminus, _threePrimeTerminus); + variable_modification_isoforms++; + if (variable_modification_isoforms == maximumVariableModificationIsoforms) + { + yield break; + } + } + } + + private bool CanBeFivePrime(Modification variableModification, int peptideLength) + { + return (variableModification.LocationRestriction == "5'-terminal." 
|| variableModification.LocationRestriction == "Oligo 5'-terminal.") + && ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, 1, peptideLength, OneBasedStartResidue); + } + + private bool CanBeThreePrime(Modification variableModification, int peptideLength) + { + return (variableModification.LocationRestriction == "3'-terminal." || variableModification.LocationRestriction == "Oligo 3'-terminal.") + && ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, peptideLength, peptideLength, OneBasedStartResidue + peptideLength - 1); + } + } +} diff --git a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs new file mode 100644 index 000000000..92b5e501c --- /dev/null +++ b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs @@ -0,0 +1,336 @@ +using Chemistry; +using MassSpectrometry; +using Omics.Digestion; +using Omics.Fragmentation; +using Omics.Modifications; +using Omics; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Security.Cryptography; +using System.Text; +using System.Threading.Tasks; +using Easy.Common.Extensions; +using Omics.Fragmentation.Oligo; + +namespace Transcriptomics.Digestion +{ + public class OligoWithSetMods : NucleolyticOligo, IBioPolymerWithSetMods, INucleicAcid + { + public OligoWithSetMods(NucleicAcid nucleicAcid, RnaDigestionParams digestionParams, int oneBaseStartResidue, + int oneBasedEndResidue, int missedCleavages, CleavageSpecificity cleavageSpecificity, + Dictionary allModsOneIsNTerminus, int numFixedMods, IHasChemicalFormula? fivePrimeTerminus = null, + IHasChemicalFormula? 
threePrimeTerminus = null) + : base(nucleicAcid, oneBaseStartResidue, oneBasedEndResidue, missedCleavages, + cleavageSpecificity, fivePrimeTerminus, threePrimeTerminus) + { + _digestionParams = digestionParams; + _allModsOneIsNterminus = allModsOneIsNTerminus; + NumFixedMods = numFixedMods; + FullSequence = this.DetermineFullSequence(); + } + + public OligoWithSetMods(string sequence, Dictionary allKnownMods, int numFixedMods = 0, + RnaDigestionParams digestionParams = null, NucleicAcid n = null, int oneBaseStartResidue = 1, int oneBasedEndResidue = 0, + int missedCleavages = 0, CleavageSpecificity cleavageSpecificity = CleavageSpecificity.Full, string description = null, + IHasChemicalFormula? fivePrimeTerminus = null, IHasChemicalFormula? threePrimeTerminus = null) + : base(n, oneBaseStartResidue, oneBasedEndResidue, missedCleavages, + cleavageSpecificity, fivePrimeTerminus, threePrimeTerminus) + { + if (sequence.Contains("|")) + { + throw new MzLibUtil.MzLibException("Ambiguous oligo cannot be parsed from string: " + sequence); + } + + FullSequence = sequence; + _baseSequence = IBioPolymerWithSetMods.GetBaseSequenceFromFullSequence(sequence); + GetModsAfterDeserialization(allKnownMods); + NumFixedMods = numFixedMods; + _digestionParams = digestionParams; + + if (n != null) + Parent = n; + } + + private RnaDigestionParams _digestionParams; + private Dictionary _allModsOneIsNterminus; + private double? _monoisotopicMass; + private ChemicalFormula? _thisChemicalFormula; + private double? _mostAbundantMonoisotopicMass; + private IDictionary>? 
_oneBasedPossibleLocalizedModifications; + + public string FullSequence { get; private set; } + public IDigestionParams DigestionParams => _digestionParams; + public IHasChemicalFormula FivePrimeTerminus + { + get => _fivePrimeTerminus; + set + { + _fivePrimeTerminus = value; + _monoisotopicMass = null; + _thisChemicalFormula = null; + _mostAbundantMonoisotopicMass = null; + } + } + + public IHasChemicalFormula ThreePrimeTerminus + { + get => _threePrimeTerminus; + set + { + _threePrimeTerminus = value; + _monoisotopicMass = null; + _thisChemicalFormula = null; + _mostAbundantMonoisotopicMass = null; + } + } + + public double MonoisotopicMass + { + get + { + if (_monoisotopicMass is null) + { + _monoisotopicMass = BaseSequence.Sum(nuc => Nucleotide.GetResidue(nuc).MonoisotopicMass) + + AllModsOneIsNterminus.Values.Sum(mod => mod.MonoisotopicMass.Value) + + FivePrimeTerminus.MonoisotopicMass + + ThreePrimeTerminus.MonoisotopicMass; + } + return _monoisotopicMass.Value; + } + } + + public ChemicalFormula ThisChemicalFormula + { + get + { + if (_thisChemicalFormula is null) + { + var fullFormula = new RNA(BaseSequence, FivePrimeTerminus, ThreePrimeTerminus).GetChemicalFormula(); + foreach (var mod in AllModsOneIsNterminus.Values) + { + if (mod.ChemicalFormula is null) + { + fullFormula = null; + break; + } + fullFormula.Add(mod.ChemicalFormula); + } + _thisChemicalFormula = fullFormula; + } + return _thisChemicalFormula!; + } + } + + public double MostAbundantMonoisotopicMass + { + get + { + if (_mostAbundantMonoisotopicMass is null) + { + var distribution = IsotopicDistribution.GetDistribution(ThisChemicalFormula); + double maxIntensity = distribution.Intensities.Max(); + _mostAbundantMonoisotopicMass = distribution.Masses[distribution.Intensities.IndexOf(maxIntensity)].RoundedDouble(); + } + return _mostAbundantMonoisotopicMass.Value; + } + } + + public string SequenceWithChemicalFormulas => throw new NotImplementedException(); + + public Dictionary 
AllModsOneIsNterminus => _allModsOneIsNterminus; + + public IDictionary> OneBasedPossibleLocalizedModifications => _oneBasedPossibleLocalizedModifications ??= + _allModsOneIsNterminus.ToDictionary(p => p.Key, p => new List() { p.Value }); + public int NumMods => AllModsOneIsNterminus.Count; + public int NumFixedMods { get; } + public int NumVariableMods => NumMods - NumFixedMods; + + /// + /// Generates theoretical fragments for given dissociation type for this peptide. + /// The "products" parameter is filled with these fragments. + /// + public void Fragment(DissociationType dissociationType, FragmentationTerminus fragmentationTerminus, + List products) + { + products.Clear(); + + List fivePrimeProductTypes = + dissociationType.GetRnaTerminusSpecificProductTypesFromDissociation(FragmentationTerminus.FivePrime); + List threePrimeProductTypes = + dissociationType.GetRnaTerminusSpecificProductTypesFromDissociation(FragmentationTerminus.ThreePrime); + + bool calculateFivePrime = + fragmentationTerminus is FragmentationTerminus.FivePrime or FragmentationTerminus.Both; + bool calculateThreePrime = + fragmentationTerminus is FragmentationTerminus.ThreePrime or FragmentationTerminus.Both; + + var sequence = (Parent as NucleicAcid)!.NucleicAcidArray[(OneBasedStartResidue - 1)..OneBasedEndResidue]; + + // intact product ion + if (fragmentationTerminus is FragmentationTerminus.Both or FragmentationTerminus.None) + products.AddRange(GetNeutralFragments(ProductType.M, sequence)); + + if (calculateFivePrime) + foreach (var type in fivePrimeProductTypes) + products.AddRange(GetNeutralFragments(type, sequence)); + + if (calculateThreePrime) + foreach (var type in threePrimeProductTypes) + products.AddRange(GetNeutralFragments(type, sequence)); + } + + /// + /// Generates theoretical internal fragments for given dissociation type for this peptide. + /// The "products" parameter is filled with these fragments. 
+ /// The "minLengthOfFragments" parameter is the minimum number of nucleic acids for an internal fragment to be included + /// + public void FragmentInternally(DissociationType dissociationType, int minLengthOfFragments, + List products) + { + throw new NotImplementedException(); + } + + /// + /// Calculates all the fragments of the types you specify + /// + /// product type to get neutral fragments from + /// Sequence to generate fragments from, will be calculated from the parent if left null + /// + public IEnumerable GetNeutralFragments(ProductType type, Nucleotide[]? sequence = null) + { + sequence ??= (Parent as NucleicAcid)!.NucleicAcidArray[(OneBasedStartResidue - 1)..OneBasedEndResidue]; + + if (type is ProductType.M) + { + yield return new Product(type, FragmentationTerminus.None, MonoisotopicMass, 0, 0, 0); + yield break; + } + + // determine mass of piece remaining after fragmentation + double monoMass = type.GetRnaMassShiftFromProductType(); + + // determine mass of terminal cap and add to fragment + bool isThreePrimeTerminal = type.GetRnaTerminusType() == FragmentationTerminus.ThreePrime; + IHasChemicalFormula terminus = isThreePrimeTerminal ? ThreePrimeTerminus : FivePrimeTerminus; + monoMass += terminus.MonoisotopicMass; + + // determine mass of each polymer component that is contained within the fragment and add to fragment + bool first = true; //set first to true to hand the terminus mod first + for (int i = 0; i <= BaseSequence.Length - 1; i++) + { + int naIndex = isThreePrimeTerminal ? Length - i : i - 1; + if (first) + { + first = false; //set to false so only handled once + continue; + } + monoMass += sequence[naIndex].MonoisotopicMass; + + if (i < 1) + continue; + + // add side-chain mod + if (AllModsOneIsNterminus.TryGetValue(naIndex + 2, out Modification mod)) + { + monoMass += mod.MonoisotopicMass ?? 
0; + } + + var previousNucleotide = sequence[naIndex]; + + double neutralLoss = 0; + if (type.ToString().Contains("Base")) + { + neutralLoss = previousNucleotide.BaseChemicalFormula.MonoisotopicMass; + } + + yield return new Product(type, + isThreePrimeTerminal ? FragmentationTerminus.ThreePrime : FragmentationTerminus.FivePrime, + monoMass - neutralLoss, i, + isThreePrimeTerminal ? BaseSequence.Length - i : i, 0, null, 0); + } + } + + public IBioPolymerWithSetMods Localize(int j, double massToLocalize) + { + var dictWithLocalizedMass = new Dictionary(AllModsOneIsNterminus); + double massOfExistingMod = 0; + if (dictWithLocalizedMass.TryGetValue(j + 2, out Modification modToReplace)) + { + massOfExistingMod = (double)modToReplace.MonoisotopicMass; + dictWithLocalizedMass.Remove(j + 2); + } + + dictWithLocalizedMass.Add(j + 2, new Modification(_locationRestriction: "Anywhere.", _monoisotopicMass: massToLocalize + massOfExistingMod)); + + var peptideWithLocalizedMass = new OligoWithSetMods(NucleicAcid, _digestionParams, OneBasedStartResidue, OneBasedEndResidue, MissedCleavages, + CleavageSpecificityForFdrCategory, dictWithLocalizedMass, NumFixedMods, FivePrimeTerminus, ThreePrimeTerminus); + + return peptideWithLocalizedMass; + } + + private void GetModsAfterDeserialization(Dictionary idToMod) + { + _allModsOneIsNterminus = new Dictionary(); + int currentModStart = 0; + int currentModificationLocation = 1; + bool currentlyReadingMod = false; + int bracketCount = 0; + + for (int r = 0; r < FullSequence.Length; r++) + { + char c = FullSequence[r]; + if (c == '[') + { + currentlyReadingMod = true; + if (bracketCount == 0) + { + currentModStart = r + 1; + } + + bracketCount++; + } + else if (c == ']') + { + string modId = null; + bracketCount--; + if (bracketCount == 0) + { + try + { + //remove the beginning section (e.g. 
"Fixed", "Variable", "Uniprot") + string modString = FullSequence.Substring(currentModStart, r - currentModStart); + int splitIndex = modString.IndexOf(':'); + string modType = modString.Substring(0, splitIndex); + modId = modString.Substring(splitIndex + 1, modString.Length - splitIndex - 1); + } + catch (Exception e) + { + throw new MzLibUtil.MzLibException( + "Error while trying to parse string into peptide: " + e.Message); + } + + if (!idToMod.TryGetValue(modId, out Modification mod)) + { + throw new MzLibUtil.MzLibException( + "Could not find modification while reading string: " + FullSequence); + } + + if (mod.LocationRestriction.Contains("C-terminal.") && r == FullSequence.Length - 1) + { + currentModificationLocation = BaseSequence.Length + 2; + } + + _allModsOneIsNterminus.Add(currentModificationLocation, mod); + currentlyReadingMod = false; + } + } + else if (!currentlyReadingMod) + { + currentModificationLocation++; + } + //else do nothing + } + } + } +} diff --git a/mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs b/mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs new file mode 100644 index 000000000..379e48fa9 --- /dev/null +++ b/mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs @@ -0,0 +1,51 @@ +using Omics.Digestion; +using Omics.Fragmentation; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Transcriptomics.Digestion +{ + public class RnaDigestionParams : IDigestionParams + { + + // this parameterless constructor needs to exist to read the toml. + // if you can figure out a way to get rid of it, feel free... 
+ public RnaDigestionParams() : this("top-down") + { + } + + public RnaDigestionParams(string rnase = "top-down", int maxMissedCleavages = 0, int minLength = 3, + int maxLength = int.MaxValue, int maxModificationIsoforms = 1024, int maxMods = 2, + FragmentationTerminus fragmentationTerminus = FragmentationTerminus.Both) + { + Rnase = RnaseDictionary.Dictionary[rnase]; + MaxMissedCleavages = maxMissedCleavages; + MinLength = minLength; + MaxLength = maxLength; + MaxMods = maxMods; + MaxModificationIsoforms = maxModificationIsoforms; + FragmentationTerminus = fragmentationTerminus; + } + + public int MaxMissedCleavages { get; set; } + public int MinLength { get; set; } + public int MaxLength { get; set; } + public int MaxModificationIsoforms { get; set; } + public int MaxMods { get; set; } + public DigestionAgent DigestionAgent => Rnase; + public Rnase Rnase { get; private set; } + public FragmentationTerminus FragmentationTerminus { get; set; } + public CleavageSpecificity SearchModeType { get; set; } = CleavageSpecificity.Full; + public IDigestionParams Clone(FragmentationTerminus? newTerminus = null) + { + return newTerminus.HasValue + ? 
new RnaDigestionParams(Rnase.Name, MaxMissedCleavages, MinLength, MaxLength, + MaxModificationIsoforms, MaxMods, newTerminus.Value) + : new RnaDigestionParams(Rnase.Name, MaxMissedCleavages, MinLength, MaxLength, + MaxModificationIsoforms, MaxMods, FragmentationTerminus); + } + } +} diff --git a/mzLib/Transcriptomics/Digestion/Rnase.cs b/mzLib/Transcriptomics/Digestion/Rnase.cs index 646bbc8d1..3670f1b3c 100644 --- a/mzLib/Transcriptomics/Digestion/Rnase.cs +++ b/mzLib/Transcriptomics/Digestion/Rnase.cs @@ -1,4 +1,5 @@ -using Omics.Digestion; +using Chemistry; +using Omics.Digestion; using Omics.Modifications; namespace Transcriptomics.Digestion @@ -13,10 +14,59 @@ public Rnase(string name, CleavageSpecificity cleaveSpecificity, List GetUnmodifiedOligos(NucleicAcid nucleicAcid, int maxMissedCleavages, int minLength, int maxLength) - // private IEnumerable FullDigestion(NucleicAcid nucleicAcid, int maxMissedCleavages, int minLength, int maxLength) - + public List GetUnmodifiedOligos(NucleicAcid nucleicAcid, int maxMissedCleavages, int minLength, + int maxLength) + { + var oligos = new List(); + + // top down + if (CleavageSpecificity == CleavageSpecificity.None) + { + if (ValidLength(nucleicAcid.Length, minLength, maxLength)) + oligos.Add(new NucleolyticOligo(nucleicAcid, 1, nucleicAcid.Length, + 0, CleavageSpecificity.Full, nucleicAcid.FivePrimeTerminus, nucleicAcid.ThreePrimeTerminus)); + } + // full cleavage + else if (CleavageSpecificity == CleavageSpecificity.Full) + { + oligos.AddRange(FullDigestion(nucleicAcid, maxMissedCleavages, minLength, maxLength)); + } + else + { + throw new ArgumentException( + "Cleave Specificity not defined for Rna digestion, currently supports Full and None"); + } + + return oligos; + } + + private IEnumerable FullDigestion(NucleicAcid nucleicAcid, int maxMissedCleavages, + int minLength, int maxLength) + { + List oneBasedIndicesToCleaveAfter = GetDigestionSiteIndices(nucleicAcid.BaseSequence); + for (int missedCleavages = 0; 
missedCleavages <= maxMissedCleavages; missedCleavages++) + { + for (int i = 0; i < oneBasedIndicesToCleaveAfter.Count - missedCleavages - 1; i++) + { + if (ValidLength(oneBasedIndicesToCleaveAfter[i + missedCleavages + 1] - oneBasedIndicesToCleaveAfter[i], + minLength, maxLength)) + { + int oneBasedStartResidue = oneBasedIndicesToCleaveAfter[i] + 1; + int oneBasedEndResidue = oneBasedIndicesToCleaveAfter[i + missedCleavages + 1]; + + // contains original 5' terminus ? keep it : set to OH + IHasChemicalFormula fivePrimeTerminus = oneBasedStartResidue == 1 ? nucleicAcid.FivePrimeTerminus : ChemicalFormula.ParseFormula("O-3P-1"); + + // contains original 3' terminus ? keep it : set to phosphate + IHasChemicalFormula threePrimeTerminus = oneBasedEndResidue == nucleicAcid.Length ? nucleicAcid.ThreePrimeTerminus : ChemicalFormula.ParseFormula("H2O4P"); + + yield return new NucleolyticOligo(nucleicAcid, oneBasedStartResidue, oneBasedEndResidue, + missedCleavages, CleavageSpecificity.Full, fivePrimeTerminus, threePrimeTerminus); + } + } + } + } + public bool Equals(Rnase? 
other) { if (ReferenceEquals(null, other)) return false; diff --git a/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs b/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs index 3d55d2ef4..d2052aee3 100644 --- a/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs +++ b/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs @@ -4,7 +4,7 @@ namespace Transcriptomics { - public interface INucleicAcid : IHasChemicalFormula, IBioPolymer + public interface INucleicAcid : IHasChemicalFormula { /// /// The amino acid sequence diff --git a/mzLib/Transcriptomics/NucleicAcid.cs b/mzLib/Transcriptomics/NucleicAcid.cs new file mode 100644 index 000000000..ef6b74cf9 --- /dev/null +++ b/mzLib/Transcriptomics/NucleicAcid.cs @@ -0,0 +1,356 @@ +using Chemistry; +using Omics.Digestion; +using Omics.Modifications; +using Omics; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Transcriptomics.Digestion; + +namespace Transcriptomics +{ + /// + /// A linear polymer of Nucleic acids + /// + public abstract class NucleicAcid : INucleicAcid, IBioPolymer, IEquatable + { + + #region Static Properties + + /// + /// The default chemical formula of the five prime (hydroxyl group) + /// + /// + /// This means that the five prime cap will remove the excess components of first nucleotides + /// phospho group, leaving only the hydroxyl. This formula will be used for the five prime cap, unless + /// the nucleic acid is constructed with a different chemical formula + /// + public static readonly ChemicalFormula DefaultFivePrimeTerminus = ChemicalFormula.ParseFormula("O-3P-1"); + + /// + /// The default chemical formula of the three prime terminus (hydroxyl group) + /// + /// + /// This is used to account for the mass of the additional hydroxyl group at the three end of most oligonucleotides. 
+ /// This formula will be used for the three prime cap, unless the nucleic acid is constructed with a different + /// chemical formula + /// + public static readonly ChemicalFormula DefaultThreePrimeTerminus = ChemicalFormula.ParseFormula("OH"); + + #endregion + + #region Constuctors + + protected NucleicAcid(string sequence, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, + IDictionary>? oneBasedPossibleLocalizedModifications = null) + { + MonoisotopicMass = 0; + Length = sequence.Length; + _nucleicAcids = new Nucleotide[Length]; + ThreePrimeTerminus = threePrimeTerm ??= DefaultThreePrimeTerminus; + FivePrimeTerminus = fivePrimeTerm ??= DefaultFivePrimeTerminus; + _oneBasedPossibleLocalizedModifications = oneBasedPossibleLocalizedModifications ?? new Dictionary>(); + GeneNames = new List>(); + + + ParseSequence(sequence); + } + + protected NucleicAcid(string sequence, string name, string identifier, string organism, string databaseFilePath, + IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, + IDictionary>? oneBasedPossibleLocalizedModifications = null, + bool isContaminant = false, bool isDecoy = false, + Dictionary? additionalDatabaseFields = null) + : this(sequence, fivePrimeTerm, threePrimeTerm, oneBasedPossibleLocalizedModifications) + { + Name = name; + DatabaseFilePath = databaseFilePath; + IsDecoy = isDecoy; + IsContaminant = isContaminant; + Organism = organism; + Accession = identifier; + AdditionalDatabaseFields = additionalDatabaseFields; + } + + #endregion + + #region Private Properties + + /// + /// The 5-Prime chemical formula cap + /// + private IHasChemicalFormula _5PrimeTerminus; + + /// + /// The 3-Prime chemical formula cap + /// + private IHasChemicalFormula _3PrimeTerminus; + + /// + /// All of the nucleic acid residues indexed by position from 5- to 3-prime. + /// + private Nucleotide[] _nucleicAcids; + + /// + /// The nucleic acid sequence. 
Is ignored if 'StoreSequenceString' is false + /// + private string _sequence; + + private IDictionary> _oneBasedPossibleLocalizedModifications; + + #endregion + + + #region Public Properties + + /// + /// Gets or sets the 5' terminus of this nucleic acid polymer + /// + public IHasChemicalFormula FivePrimeTerminus + { + get => _5PrimeTerminus; + set => ReplaceTerminus(ref _5PrimeTerminus, value); + } + + /// + /// Gets or sets the 3' terminus of this nucleic acid polymer + /// + public IHasChemicalFormula ThreePrimeTerminus + { + get => _3PrimeTerminus; + set => ReplaceTerminus(ref _3PrimeTerminus, value); + } + + /// + /// Gets the number of nucleic acids in this nucleic acid polymer + /// + public int Length { get; private set; } + + + // TODO: These interface members + public string Name { get; } + public string FullName => Name; // TODO: Consider if this needs to be different from the name + public string DatabaseFilePath { get; } + public bool IsDecoy { get; } + public bool IsContaminant { get; } + public string Accession { get; } + + public IDictionary> OneBasedPossibleLocalizedModifications => _oneBasedPossibleLocalizedModifications; + public string Organism { get; } + + /// + /// The list of gene names consists of tuples, where Item1 is the type of gene name, and Item2 is the name. There may be many genes and names of a certain type produced when reading an XML protein database. + /// + public IEnumerable> GeneNames { get; } + public Dictionary? AdditionalDatabaseFields { get; } + + /// + /// The total monoisotopic mass of this peptide and all of its modifications + /// + public double MonoisotopicMass { get; private set; } + + /// + /// Returns a copy of the nucleic acid array, used for -base mass calculations. 
+ /// + public Nucleotide[] NucleicAcidArray => _nucleicAcids; + + public ChemicalFormula ThisChemicalFormula => GetChemicalFormula(); + + #endregion + + #region Nucleic Acid Sequence + + /// + /// Gets the base nucleic acid sequence + /// + public string BaseSequence + { + get + { + // Generate the sequence if the stored version is null or empty + if (string.IsNullOrEmpty(_sequence)) + { + _sequence = new string(_nucleicAcids.Select(na => na.Letter).ToArray()); + } + + return _sequence; + } + } + + public char this[int zeroBasedIndex] => BaseSequence[zeroBasedIndex]; + + #endregion + + #region Digestion + + public IEnumerable Digest(IDigestionParams digestionParameters, List allKnownFixedMods, + List variableModifications, List silacLabels = null, (SilacLabel startLabel, SilacLabel endLabel)? turnoverLabels = null, + bool topDownTruncationSearch = false) + { + if (digestionParameters is not RnaDigestionParams digestionParams) + throw new ArgumentException( + "DigestionParameters must be of type DigestionParams for protein digestion"); + allKnownFixedMods ??= new(); + variableModifications ??= new(); + + // digest based upon base sequence + foreach (var unmodifiedOligo in digestionParams.Rnase.GetUnmodifiedOligos(this, + digestionParams.MaxMissedCleavages, digestionParams.MinLength, digestionParams.MaxLength)) + { + // add fixed and variable mods to base sequence digestion products + foreach (var modifiedOligo in unmodifiedOligo.GetModifiedOligos(allKnownFixedMods, digestionParams, + variableModifications)) + { + yield return modifiedOligo; + } + } + } + + public IEnumerable Digest(RnaDigestionParams digestionParameters, + List allKnownFixedMods, + List variableModifications, List silacLabels = null, + (SilacLabel startLabel, SilacLabel endLabel)? 
turnoverLabels = null, + bool topDownTruncationSearch = false) + { + return Digest((IDigestionParams)digestionParameters, allKnownFixedMods, variableModifications, silacLabels, turnoverLabels, topDownTruncationSearch) + .Cast(); + } + + #endregion + + #region Electrospray + + public IEnumerable GetElectrospraySeries(int minCharge, int maxCharge) + { + for (int i = minCharge; i < maxCharge; i++) + { + yield return this.ToMz(i); + } + } + + #endregion + + #region Chemical Formula + + public ChemicalFormula GetChemicalFormula() + { + var formula = new ChemicalFormula(); + + // Handle 5'-Terminus + formula.Add(FivePrimeTerminus.ThisChemicalFormula); + + // Handle 3'-Terminus + formula.Add(ThreePrimeTerminus.ThisChemicalFormula); + + // Handle Nucleic Acid Residues + for (int i = 0; i < Length; i++) + { + formula.Add(_nucleicAcids[i].ThisChemicalFormula); + } + + return formula; + } + + #endregion + + #region Private Methods + + bool ReplaceTerminus(ref IHasChemicalFormula terminus, IHasChemicalFormula value) + { + if (Equals(value, terminus)) + return false; + + if (terminus != null) + MonoisotopicMass -= terminus.MonoisotopicMass; + + terminus = value; + + if (value != null) + MonoisotopicMass += value.MonoisotopicMass; + + return true; + } + + /// + /// Parses a string sequence of nucleic acids characters into a peptide object + /// + /// + /// + private bool ParseSequence(string sequence) + { + if (string.IsNullOrEmpty(sequence)) + return false; + + int index = 0; + + double monoMass = 0; + ChemicalFormula chemFormula = new(); + + StringBuilder sb = null; + sb = new StringBuilder(sequence.Length); + + foreach (char letter in sequence) + { + Nucleotide residue; + if (Nucleotide.TryGetResidue(letter, out residue)) + { + _nucleicAcids[index++] = residue; + sb.Append(residue.Letter); + monoMass += residue.MonoisotopicMass; + } + else + { + switch (letter) + { + case ' ': // ignore spaces + break; + + case '*': // ignore * + break; + + default: + throw new 
ArgumentException(string.Format( + "Nucleic Acid Letter {0} does not exist in the Nucleic Acid Dictionary. {0} is also not a valid character", + letter)); + } + } + } + + _sequence = sb.ToString(); + Length = index; + MonoisotopicMass += monoMass; + Array.Resize(ref _nucleicAcids, Length); + + return true; + } + + #endregion + + #region Interface Implemntations and Overrides + + public bool Equals(NucleicAcid? other) + { + if (ReferenceEquals(null, other)) return false; + if (ReferenceEquals(this, other)) return true; + return _5PrimeTerminus.Equals(other._5PrimeTerminus) + && _3PrimeTerminus.Equals(other._3PrimeTerminus); + } + + public override bool Equals(object? obj) + { + if (ReferenceEquals(null, obj)) return false; + if (ReferenceEquals(this, obj)) return true; + if (obj.GetType() != this.GetType()) return false; + return Equals((NucleicAcid)obj); + } + + public override int GetHashCode() + { + return HashCode.Combine(_5PrimeTerminus, _3PrimeTerminus, _sequence); + } + + #endregion + } +} diff --git a/mzLib/Transcriptomics/RNA.cs b/mzLib/Transcriptomics/RNA.cs new file mode 100644 index 000000000..3e72c1f14 --- /dev/null +++ b/mzLib/Transcriptomics/RNA.cs @@ -0,0 +1,51 @@ +using Chemistry; +using Omics.Modifications; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Transcriptomics +{ + public class RNA : NucleicAcid + { + /// + /// For constructing RNA from a string + /// + /// + /// + /// + /// + public RNA(string sequence, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, + IDictionary>? 
oneBasedPossibleLocalizedModifications = null) + : base(sequence, fivePrimeTerm, threePrimeTerm, oneBasedPossibleLocalizedModifications) + { + } + + /// + /// For use with RNA loaded from a database + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + public RNA(string sequence, string name, string identifier, string organism, string databaseFilePath, + IHasChemicalFormula? fivePrimeTerminus = null, IHasChemicalFormula? threePrimeTerminus = null, + IDictionary>? oneBasedPossibleModifications = null, + bool isContaminant = false, bool isDecoy = false, + Dictionary? databaseAdditionalFields = null) + : base(sequence, name, identifier, organism, databaseFilePath, fivePrimeTerminus, threePrimeTerminus, + oneBasedPossibleModifications, isContaminant, isDecoy, databaseAdditionalFields) + { + + } + } +} diff --git a/mzLib/mzLib.sln.DotSettings b/mzLib/mzLib.sln.DotSettings index 78477fa52..06594535d 100644 --- a/mzLib/mzLib.sln.DotSettings +++ b/mzLib/mzLib.sln.DotSettings @@ -1,11 +1,14 @@ - + True True True True True + True + True True True True + True True True \ No newline at end of file From de33dfa648cae8b09e1efb309d12161b59f54186 Mon Sep 17 00:00:00 2001 From: nbollis Date: Thu, 19 Sep 2024 17:30:43 -0500 Subject: [PATCH 02/17] Implemented all tests --- .../Oligo/DissociationTypeCollection.cs | 4 +- mzLib/Test/Transcriptomics/TestDigestion.cs | 780 ++++++++++++++++++ .../Test/Transcriptomics/TestFragmentation.cs | 239 ++++++ mzLib/Test/Transcriptomics/TestNucleicAcid.cs | 171 ++++ mzLib/Test/Transcriptomics/TestProductType.cs | 280 +++++++ mzLib/Test/Transcriptomics/TestRnase.cs | 3 +- mzLib/Transcriptomics/NucleicAcid.cs | 33 +- 7 files changed, 1498 insertions(+), 12 deletions(-) create mode 100644 mzLib/Test/Transcriptomics/TestDigestion.cs create mode 100644 mzLib/Test/Transcriptomics/TestFragmentation.cs create mode 100644 mzLib/Test/Transcriptomics/TestNucleicAcid.cs create mode 100644 mzLib/Test/Transcriptomics/TestProductType.cs diff 
--git a/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs b/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs index b2b7cd891..3bc08d089 100644 --- a/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs +++ b/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs @@ -1,2 +1,4 @@ using Chemistry; using MassSpectrometry; namespace Omics.Fragmentation.Oligo { /// /// Methods dealing with specific product type for RNA molecules /// public static class DissociationTypeCollection { - /// /// Product Ion types by dissociation method /// /// /// HCD ions were taken from the following paper: https://www.nature.com/articles/s41598-023-36193-2 /// Ion types below here should be validated with experimental results. /// Base and water losses occur very frequently and may also be present in these activation types. /// CID, UVPD, and aEPD ions were taken from the following paper: https://pubs.acs.org/doi/10.1021/acs.analchem.3c05428?ref=PDF /// NETD ions were taken from the following paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7161943/ /// lowCID ions were taken from this Thermo Poster: https://assets.thermofisher.com/TFS-Assets/CMD/Flyers/fl-489263-asms23-optimized-fragmentation-oligonucleotides-suppresses-undesired-fragmentation-fl489263-en.pdf /// public static Dictionary> ProductsFromDissociationType = new Dictionary>() { { DissociationType.Unknown, new List() }, { DissociationType.Custom, new List() }, { DissociationType.AnyActivationType, new List { ProductType.a, ProductType.aBaseLoss, ProductType.aWaterLoss, ProductType.b, ProductType.bBaseLoss, ProductType.bWaterLoss, ProductType.c, ProductType.cBaseLoss, ProductType.cWaterLoss, ProductType.d, ProductType.dBaseLoss, ProductType.dWaterLoss, ProductType.w, ProductType.wBaseLoss, ProductType.wWaterLoss, ProductType.x, ProductType.xBaseLoss, ProductType.xWaterLoss, ProductType.y, ProductType.yBaseLoss, ProductType.yWaterLoss, ProductType.z, ProductType.zBaseLoss, 
ProductType.zWaterLoss, ProductType.M } }, { DissociationType.CID, new List { ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M } }, { DissociationType.HCD, new List { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.z, ProductType.M } }, { DissociationType.UVPD, new List { ProductType.a, ProductType.c, ProductType.d, ProductType.w, ProductType.M } }, { DissociationType.aEPD, new List { ProductType.a, ProductType.c, ProductType.d, ProductType.w, ProductType.x, ProductType.z, ProductType.M } }, { DissociationType.NETD, new List { ProductType.w, ProductType.d, ProductType.M } }, { DissociationType.LowCID, new List() { ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M } }, { DissociationType.IRMPD, new List() { } }, { DissociationType.ECD, new List { } }, { DissociationType.PQD, new List { } }, { DissociationType.ETD, new List { } }, { DissociationType.EThcD, new List { } }, }; /// /// Returns list of products types based upon the dissociation type /// /// /// public static List GetRnaProductTypesFromDissociationType(this DissociationType dissociationType) => ProductsFromDissociationType[dissociationType]; /// /// Mass to be added or subtracted /// private static readonly Dictionary FragmentIonCaps = new Dictionary { { ProductType.a, ChemicalFormula.ParseFormula("H") }, { ProductType.aWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, { ProductType.b, ChemicalFormula.ParseFormula("OH") }, { ProductType.bWaterLoss, ChemicalFormula.ParseFormula("H-1") }, { ProductType.c, ChemicalFormula.ParseFormula("O3H2P") }, { ProductType.cWaterLoss, ChemicalFormula.ParseFormula("O2P") }, { ProductType.d, ChemicalFormula.ParseFormula("O4H2P") }, { ProductType.dWaterLoss, 
ChemicalFormula.ParseFormula("O3P") }, { ProductType.w, ChemicalFormula.ParseFormula("H") }, { ProductType.wWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, { ProductType.x, ChemicalFormula.ParseFormula("O-1H") }, { ProductType.xWaterLoss, ChemicalFormula.ParseFormula("O-2H-1") }, { ProductType.y, ChemicalFormula.ParseFormula("O-3P-1") }, { ProductType.yWaterLoss, ChemicalFormula.ParseFormula("O-4H-2P-1") }, { ProductType.z, ChemicalFormula.ParseFormula("O-4P-1") }, { ProductType.zWaterLoss, ChemicalFormula.ParseFormula("O-5H-2P-1") }, //fragment - Base chemical formula is the corresponding fragment chemical formula subtracing 1 H as H is lost when base is removed { ProductType.aBaseLoss, ChemicalFormula.ParseFormula("H-2") }, // "H-1" -H { ProductType.bBaseLoss, ChemicalFormula.ParseFormula("O1H-2") }, //"OH1" -H { ProductType.cBaseLoss, ChemicalFormula.ParseFormula("O3H-1P") }, //"O3P" -H { ProductType.dBaseLoss, ChemicalFormula.ParseFormula("O4H-1P") }, //"O4H2P" -H { ProductType.wBaseLoss, ChemicalFormula.ParseFormula("H-2") }, //"H"-H { ProductType.xBaseLoss, ChemicalFormula.ParseFormula("O-1H-2") }, //"O-1H" -H { ProductType.yBaseLoss, ChemicalFormula.ParseFormula("O-3H-2P-1") }, //"O-3P-1" -H { ProductType.zBaseLoss, ChemicalFormula.ParseFormula("O-4H-3P-1") }, //"O-4H-1P-1" -1 { ProductType.M, new ChemicalFormula() } }; /// /// Returns mass shift by product type /// /// /// public static double GetRnaMassShiftFromProductType(this ProductType type) => FragmentIonCaps[type].MonoisotopicMass; public static FragmentationTerminus GetRnaTerminusType(this ProductType fragmentType) { switch (fragmentType) { case ProductType.a: case ProductType.aWaterLoss: case ProductType.aBaseLoss: case ProductType.b: case ProductType.bWaterLoss: case ProductType.bBaseLoss: case ProductType.c: case ProductType.cWaterLoss: case ProductType.cBaseLoss: case ProductType.d: case ProductType.dWaterLoss: case ProductType.dBaseLoss: return FragmentationTerminus.FivePrime; case 
ProductType.w: case ProductType.wWaterLoss: case ProductType.wBaseLoss: case ProductType.x: case ProductType.xWaterLoss: case ProductType.xBaseLoss: case ProductType.y: case ProductType.yWaterLoss: case ProductType.yBaseLoss: case ProductType.z: case ProductType.zWaterLoss: case ProductType.zBaseLoss: return FragmentationTerminus.ThreePrime; case ProductType.M: return FragmentationTerminus.None; case ProductType.aStar: case ProductType.aDegree: case ProductType.bAmmoniaLoss: case ProductType.yAmmoniaLoss: case ProductType.zPlusOne: case ProductType.D: case ProductType.Ycore: case ProductType.Y: default: throw new ArgumentOutOfRangeException(nameof(fragmentType), fragmentType, null); } } /// /// Product ion types by Fragmentation Terminus /// private static readonly Dictionary> ProductIonTypesFromSpecifiedTerminus = new Dictionary> { { FragmentationTerminus.FivePrime, new List { ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, } }, { FragmentationTerminus.ThreePrime, new List { ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, } }, { FragmentationTerminus.Both, new List { ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, ProductType.M } } 
}; public static List GetRnaTerminusSpecificProductTypes( this FragmentationTerminus fragmentationTerminus) { return ProductIonTypesFromSpecifiedTerminus[fragmentationTerminus]; } /// /// Returns all product ion types based upon specified terminus /// /// /// /// public static List GetRnaTerminusSpecificProductTypesFromDissociation( this DissociationType dissociationType, FragmentationTerminus fragmentationTerminus) { var terminusSpecific = fragmentationTerminus.GetRnaTerminusSpecificProductTypes(); var dissociationSpecific = dissociationType.GetRnaProductTypesFromDissociationType(); return terminusSpecific.Intersect(dissociationSpecific).ToList(); } } } \ No newline at end of file + /// /// Product Ion types by dissociation method /// /// /// HCD ions were taken from the following paper: https://www.nature.com/articles/s41598-023-36193-2 /// Ion types below here should be validated with experimental results. /// Base and water losses occur very frequently and may also be present in these activation types. 
/// CID, UVPD, and aEPD ions were taken from the following paper: https://pubs.acs.org/doi/10.1021/acs.analchem.3c05428?ref=PDF /// NETD ions were taken from the following paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7161943/ /// lowCID ions were taken from this Thermo Poster: https://assets.thermofisher.com/TFS-Assets/CMD/Flyers/fl-489263-asms23-optimized-fragmentation-oligonucleotides-suppresses-undesired-fragmentation-fl489263-en.pdf /// public static Dictionary> ProductsFromDissociationType = new Dictionary>() { { DissociationType.Unknown, new List() }, { DissociationType.Custom, new List() }, { DissociationType.AnyActivationType, new List { ProductType.a, ProductType.aBaseLoss, ProductType.aWaterLoss, ProductType.b, ProductType.bBaseLoss, ProductType.bWaterLoss, ProductType.c, ProductType.cBaseLoss, ProductType.cWaterLoss, ProductType.d, ProductType.dBaseLoss, ProductType.dWaterLoss, ProductType.w, ProductType.wBaseLoss, ProductType.wWaterLoss, ProductType.x, ProductType.xBaseLoss, ProductType.xWaterLoss, ProductType.y, ProductType.yBaseLoss, ProductType.yWaterLoss, ProductType.z, ProductType.zBaseLoss, ProductType.zWaterLoss, ProductType.M } }, { DissociationType.CID, new List { ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M } }, { DissociationType.HCD, new List { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.z, ProductType.M } }, { DissociationType.UVPD, new List { ProductType.a, ProductType.c, ProductType.d, ProductType.w, ProductType.M } }, { DissociationType.aEPD, new List { ProductType.a, ProductType.c, ProductType.d, ProductType.w, ProductType.x, ProductType.z, ProductType.M } }, { DissociationType.NETD, new List { ProductType.w, ProductType.d, ProductType.M } }, { DissociationType.LowCID, new List() { ProductType.aBaseLoss, 
ProductType.c, ProductType.dWaterLoss, ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M } }, { DissociationType.IRMPD, new List() { } }, { DissociationType.ECD, new List { } }, { DissociationType.PQD, new List { } }, { DissociationType.ETD, new List { } }, { DissociationType.EThcD, new List { } }, }; + + /// /// Returns all dissociation types with implemented product type collections /// public static IEnumerable AllImplementedDissociationTypes => ProductsFromDissociationType.Where(p => p.Value.Any()) .Select(p => p.Key); /// /// Returns list of product types based upon the dissociation type /// /// /// public static List GetRnaProductTypesFromDissociationType(this DissociationType dissociationType) => ProductsFromDissociationType[dissociationType]; /// /// Mass to be added or subtracted /// private static readonly Dictionary FragmentIonCaps = new Dictionary { { ProductType.a, ChemicalFormula.ParseFormula("H") }, { ProductType.aWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, { ProductType.b, ChemicalFormula.ParseFormula("OH") }, { ProductType.bWaterLoss, ChemicalFormula.ParseFormula("H-1") }, { ProductType.c, ChemicalFormula.ParseFormula("O3H2P") }, { ProductType.cWaterLoss, ChemicalFormula.ParseFormula("O2P") }, { ProductType.d, ChemicalFormula.ParseFormula("O4H2P") }, { ProductType.dWaterLoss, ChemicalFormula.ParseFormula("O3P") }, { ProductType.w, ChemicalFormula.ParseFormula("H") }, { ProductType.wWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, { ProductType.x, ChemicalFormula.ParseFormula("O-1H") }, { ProductType.xWaterLoss, ChemicalFormula.ParseFormula("O-2H-1") }, { ProductType.y, ChemicalFormula.ParseFormula("O-3P-1") }, { ProductType.yWaterLoss, ChemicalFormula.ParseFormula("O-4H-2P-1") }, { ProductType.z, ChemicalFormula.ParseFormula("O-4P-1") }, { ProductType.zWaterLoss, ChemicalFormula.ParseFormula("O-5H-2P-1") }, //fragment - Base chemical formula is the corresponding fragment chemical formula subtracting 1 H as H is lost 
when base is removed { ProductType.aBaseLoss, ChemicalFormula.ParseFormula("H-2") }, // "H-1" -H { ProductType.bBaseLoss, ChemicalFormula.ParseFormula("O1H-2") }, //"OH1" -H { ProductType.cBaseLoss, ChemicalFormula.ParseFormula("O3H-1P") }, //"O3P" -H { ProductType.dBaseLoss, ChemicalFormula.ParseFormula("O4H-1P") }, //"O4H2P" -H { ProductType.wBaseLoss, ChemicalFormula.ParseFormula("H-2") }, //"H"-H { ProductType.xBaseLoss, ChemicalFormula.ParseFormula("O-1H-2") }, //"O-1H" -H { ProductType.yBaseLoss, ChemicalFormula.ParseFormula("O-3H-2P-1") }, //"O-3P-1" -H { ProductType.zBaseLoss, ChemicalFormula.ParseFormula("O-4H-3P-1") }, //"O-4H-1P-1" -1 { ProductType.M, new ChemicalFormula() } }; /// /// Returns mass shift by product type /// /// /// public static double GetRnaMassShiftFromProductType(this ProductType type) => FragmentIonCaps[type].MonoisotopicMass; public static FragmentationTerminus GetRnaTerminusType(this ProductType fragmentType) { switch (fragmentType) { case ProductType.a: case ProductType.aWaterLoss: case ProductType.aBaseLoss: case ProductType.b: case ProductType.bWaterLoss: case ProductType.bBaseLoss: case ProductType.c: case ProductType.cWaterLoss: case ProductType.cBaseLoss: case ProductType.d: case ProductType.dWaterLoss: case ProductType.dBaseLoss: return FragmentationTerminus.FivePrime; case ProductType.w: case ProductType.wWaterLoss: case ProductType.wBaseLoss: case ProductType.x: case ProductType.xWaterLoss: case ProductType.xBaseLoss: case ProductType.y: case ProductType.yWaterLoss: case ProductType.yBaseLoss: case ProductType.z: case ProductType.zWaterLoss: case ProductType.zBaseLoss: return FragmentationTerminus.ThreePrime; case ProductType.M: return FragmentationTerminus.None; case ProductType.aStar: case ProductType.aDegree: case ProductType.bAmmoniaLoss: case ProductType.yAmmoniaLoss: case ProductType.zPlusOne: case ProductType.D: case ProductType.Ycore: case ProductType.Y: default: throw new 
ArgumentOutOfRangeException(nameof(fragmentType), fragmentType, null); } } /// /// Product ion types by Fragmentation Terminus /// private static readonly Dictionary> ProductIonTypesFromSpecifiedTerminus = new Dictionary> { { FragmentationTerminus.FivePrime, new List { ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, } }, { FragmentationTerminus.ThreePrime, new List { ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, } }, { FragmentationTerminus.Both, new List { ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, ProductType.M } } }; public static List GetRnaTerminusSpecificProductTypes( this FragmentationTerminus fragmentationTerminus) { return ProductIonTypesFromSpecifiedTerminus[fragmentationTerminus]; } /// /// Returns all product ion types based upon specified terminus /// /// /// /// public static List GetRnaTerminusSpecificProductTypesFromDissociation( this DissociationType dissociationType, FragmentationTerminus fragmentationTerminus) { var terminusSpecific = fragmentationTerminus.GetRnaTerminusSpecificProductTypes(); var dissociationSpecific = dissociationType.GetRnaProductTypesFromDissociationType(); return 
terminusSpecific.Intersect(dissociationSpecific).ToList(); } } } \ No newline at end of file diff --git a/mzLib/Test/Transcriptomics/TestDigestion.cs b/mzLib/Test/Transcriptomics/TestDigestion.cs new file mode 100644 index 000000000..6b385be0f --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestDigestion.cs @@ -0,0 +1,780 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Linq; +using System.Security.Cryptography; +using Chemistry; +using MassSpectrometry; +using MathNet.Numerics.Distributions; +using NUnit.Framework; +using Omics.Digestion; +using Omics.Fragmentation; +using Omics.Modifications; +using Transcriptomics; +using Transcriptomics.Digestion; +using UsefulProteomicsDatabases; + +namespace Test.Transcriptomics +{ + [TestFixture] + [ExcludeFromCodeCoverage] + public class TestDigestion + { + public record RnaDigestionTestCase(string BaseSequence, string Enzyme, int MissedCleavages, int MinLength, + int MaxLength, int DigestionProductCount, + double[] MonoMasses, string[] Sequences); + + public static IEnumerable GetTestCases() + { + // 6bp Top Down + yield return new RnaDigestionTestCase("GUACUG", "top-down", + 0, 1, 6, 1, + new[] { 1874.28 }, + new[] { "GUACUG" }); + // 6bp Rnase T1, normal + yield return new RnaDigestionTestCase("GUACUG", "RNase T1", + 0, 1, 6, 2, + new[] { 363.057, 1529.234 }, + new[] { "G", "UACUG" }); + // 6bp Cusativin, normal + yield return new RnaDigestionTestCase("GUACUG", "Cusativin", + 0, 1, 6, 2, + new[] { 1303.175, 589.116 }, + new[] { "GUAC", "UG" }); + // 6bp Rnase T1, one product too short + yield return new RnaDigestionTestCase("GUACUG", "RNase T1", + 0, 3, 6, 1, + new[] { 1529.234 }, + new[] { "UACUG" }); + // 6bp Rnase T1, one product too long + yield return new RnaDigestionTestCase("GUACUG", "RNase T1", + 0, 1, 2, 1, + new[] { 363.057 }, + new[] { "G" }); + // 6bp Rnase T1, 1 missed cleavage + yield return new RnaDigestionTestCase("GUACUG", "RNase 
T1", + 1, 1, 6, 3, + new[] { 363.057, 1529.234, 1874.28 }, + new[] { "G", "UACUG", "GUACUG" }); + // 6bp Rnase A + yield return new RnaDigestionTestCase("GUACUG", "RNase A", + 0, 1, 6, 4, + new[] { 669.082, 652.103, 324.035, 283.091 }, + new[] { "GU", "AC", "U", "G" }); + // 6bp Rnase A, 1 missed cleavage + yield return new RnaDigestionTestCase("GUACUG", "RNase A", + 1, 1, 6, 7, + new[] { 669.082, 652.103, 324.035, 283.091, 1303.175, 958.128, 589.116 }, + new[] { "GU", "AC", "U", "G", "GUAC", "ACU", "UG" }); + // 6bp Rnase A, 2 missed cleavages + yield return new RnaDigestionTestCase("GUACUG", "RNase A", + 2, 1, 6, 9, + new[] { 669.082, 652.103, 324.035, 283.091, 1303.175, 958.128, 589.116, 1609.200, 1223.209 }, + new[] { "GU", "AC", "U", "G", "GUAC", "ACU", "UG", "GUACU", "ACUG" }); + // 20bp top-down + yield return new RnaDigestionTestCase("GUACUGCCUCUAGUGAAGCA", "top-down", + 0, 1, int.MaxValue, 1, + new[] { 6363.871 }, + new[] { "GUACUGCCUCUAGUGAAGCA" }); + // 20bp Rnase T1, normal + yield return new RnaDigestionTestCase("GUACUGCCUCUAGUGAAGCA", "RNase T1", + 0, 1, int.MaxValue, 6, + new[] { 363.057, 1609.200, 2219.282, 669.082, 1021.161, 572.137 }, + new[] { "G", "UACUG", "CCUCUAG", "UG", "AAG", "CA" }); + } + + public static string rnaseTsvpath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"Digestion\rnases.tsv"); + + [OneTimeSetUp] + public void OneTimeSetup() + { + RnaseDictionary.Dictionary = RnaseDictionary.LoadRnaseDictionary(rnaseTsvpath); + } + + #region Rnase + + [Test] + public void TestRnaseDictionaryLoading() + { + var rnaseCountFromTsv = File.ReadAllLines(rnaseTsvpath).Length - 1; + Assert.That(RnaseDictionary.Dictionary.Count, Is.EqualTo(rnaseCountFromTsv)); + } + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestRnase_GetUnmodifiedOligos_Counts(RnaDigestionTestCase testCase) + { + RNA rna = new RNA(testCase.BaseSequence); + Rnase rnase = RnaseDictionary.Dictionary[testCase.Enzyme]; + var digestionProducts = + 
rnase.GetUnmodifiedOligos(rna, testCase.MissedCleavages, testCase.MinLength, testCase.MaxLength); + + Assert.That(digestionProducts.Count(), Is.EqualTo(testCase.DigestionProductCount)); + } + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestRnase_GetUnmodifiedOligo_Sequence(RnaDigestionTestCase testCase) + { + RNA rna = new RNA(testCase.BaseSequence); + Rnase rnase = RnaseDictionary.Dictionary[testCase.Enzyme]; + var digestionProducts = + rnase.GetUnmodifiedOligos(rna, testCase.MissedCleavages, testCase.MinLength, testCase.MaxLength); + + Assert.That(digestionProducts.Count, Is.EqualTo(testCase.Sequences.Length)); + for (var i = 0; i < digestionProducts.Count; i++) + { + var product = digestionProducts[i]; + var testCaseCaseSequence = testCase.Sequences[i]; + Assert.That(product.BaseSequence == testCaseCaseSequence); + } + } + + [Test] + public void TestRnaseEqualityProperties() + { + Rnase t1 = RnaseDictionary.Dictionary["RNase T1"]; + Rnase t1Duplicate = RnaseDictionary.Dictionary["RNase T1"]; + Rnase t2 = RnaseDictionary.Dictionary["RNase T2"]; + + Assert.That(t1.Equals(t1Duplicate)); + Assert.That(t1.Equals(t1)); + Assert.That(!t1.Equals(t2)); + Assert.That(!t1.Equals(null)); + Assert.That(t1.GetHashCode(), Is.EqualTo(t1Duplicate.GetHashCode())); + Assert.That(t1.GetHashCode(), Is.Not.EqualTo(t2.GetHashCode())); + Assert.That(t1.Equals((object)t1Duplicate)); + Assert.That(t1.Equals((object)t1)); + Assert.That(!t1.Equals((object)t2)); + Assert.That(!t1.Equals((object)null)); + // ReSharper disable once SuspiciousTypeConversion.Global + Assert.That(!t1.Equals((object)new RNA("GUA"))); + } + + [Test] + public void TestRnase_UnmodifiedOligos_Exception() + { + Rnase rnase = new Rnase("Bad", CleavageSpecificity.SingleC, new List()); + Assert.Throws(() => { rnase.GetUnmodifiedOligos(new RNA("GUACUG"), 0, 1, 6); }); + } + + #endregion + + #region NucleolyticOligo + + [Test] + public void TestNucleolyticOligoProperties_FivePrimeDigestionProduct() + { 
+ RNA rna = new("GUACUG"); + Rnase rnase = RnaseDictionary.Dictionary["RNase U2"]; + var digestionProducts = rnase.GetUnmodifiedOligos(rna, 0, 1, 6); + Assert.That(digestionProducts.Count, Is.EqualTo(3)); + + var oligo = digestionProducts[0]; + Assert.That(oligo.BaseSequence, Is.EqualTo("G")); + Assert.That(oligo.OneBasedStartResidue, Is.EqualTo(1)); + Assert.That(oligo.OneBasedEndResidue, Is.EqualTo(1)); + Assert.That(oligo.MissedCleavages, Is.EqualTo(0)); + Assert.That(oligo.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(oligo.NextResidue, Is.EqualTo('U')); + Assert.That(oligo.PreviousResidue, Is.EqualTo('-')); + Assert.That(oligo.ToString(), Is.EqualTo(oligo.BaseSequence)); + } + + [Test] + public void TestNucleolyticOligoProperties_ThreePrimeDigestionProduct() + { + RNA rna = new("GUACUG"); + Rnase rnase = RnaseDictionary.Dictionary["RNase U2"]; + var digestionProducts = rnase.GetUnmodifiedOligos(rna, 0, 1, 6); + Assert.That(digestionProducts.Count, Is.EqualTo(3)); + + NucleolyticOligo oligo = digestionProducts[2]; + Assert.That(oligo.BaseSequence, Is.EqualTo("CUG")); + Assert.That(oligo.OneBasedStartResidue, Is.EqualTo(4)); + Assert.That(oligo.OneBasedEndResidue, Is.EqualTo(6)); + Assert.That(oligo.MissedCleavages, Is.EqualTo(0)); + Assert.That(oligo.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(oligo.NextResidue, Is.EqualTo('-')); + Assert.That(oligo.PreviousResidue, Is.EqualTo('A')); + Assert.That(oligo.ToString(), Is.EqualTo(oligo.BaseSequence)); + } + + [Test] + public void TestNucleolyticOligoProperties_InternalDigestionProduct() + { + RNA rna = new("GUACUG"); + Rnase rnase = RnaseDictionary.Dictionary["RNase U2"]; + var digestionProducts = rnase.GetUnmodifiedOligos(rna, 0, 1, 6); + Assert.That(digestionProducts.Count, Is.EqualTo(3)); + + NucleolyticOligo oligo = digestionProducts[1]; + Assert.That(oligo.BaseSequence, Is.EqualTo("UA")); + 
Assert.That(oligo.OneBasedStartResidue, Is.EqualTo(2)); + Assert.That(oligo.OneBasedEndResidue, Is.EqualTo(3)); + Assert.That(oligo.MissedCleavages, Is.EqualTo(0)); + Assert.That(oligo.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(oligo.NextResidue, Is.EqualTo('C')); + Assert.That(oligo.PreviousResidue, Is.EqualTo('G')); + Assert.That(oligo.ToString(), Is.EqualTo(oligo.BaseSequence)); + } + + [Test] + public void TestNucleolyticOligoProperties_TopDownDigestionProduct() + { + RNA rna = new("GUACUG"); + Rnase rnase = RnaseDictionary.Dictionary["top-down"]; + var digestionProducts = rnase.GetUnmodifiedOligos(rna, 0, 1, 6); + Assert.That(digestionProducts.Count, Is.EqualTo(1)); + + NucleolyticOligo oligo = digestionProducts[0]; + Assert.That(oligo.BaseSequence, Is.EqualTo("GUACUG")); + Assert.That(oligo.OneBasedStartResidue, Is.EqualTo(1)); + Assert.That(oligo.OneBasedEndResidue, Is.EqualTo(6)); + Assert.That(oligo.MissedCleavages, Is.EqualTo(0)); + Assert.That(oligo.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(oligo.NextResidue, Is.EqualTo('-')); + Assert.That(oligo.PreviousResidue, Is.EqualTo('-')); + Assert.That(oligo.ToString(), Is.EqualTo(oligo.BaseSequence)); + } + + #endregion + + #region OligoWithSetMods + + private static (string Sequence, int FragmentNumber, ProductType Type, double Mass)[] DigestFragmentTestCases => + new (string Sequence, int FragmentNumber, ProductType Type, double Mass)[] + { + ("UAG", 0, ProductType.M, 998.134), + ("UAG", 1, ProductType.aBaseLoss, 114.031), ("UAG", 2, ProductType.aBaseLoss, 420.056), + ("UAG", 1, ProductType.c, 308.031), ("UAG", 2, ProductType.c, 637.093), + ("UAG", 1, ProductType.dWaterLoss, 306.025), ("UAG", 2, ProductType.dWaterLoss, 635.077), + ("UAG", 1, ProductType.w, 443.023), ("UAG", 2, ProductType.w, 772.075), + ("UAG", 1, ProductType.y, 363.057), ("UAG", 2, ProductType.y, 692.109), + ("UAG", 1, ProductType.yWaterLoss, 345.047), 
("UAG", 2, ProductType.yWaterLoss, 674.100), + + ("UCG", 0, ProductType.M, 974.123), + ("UCG", 1, ProductType.aBaseLoss, 114.031), ("UCG", 2, ProductType.aBaseLoss, 420.056), + ("UCG", 1, ProductType.c, 308.040), ("UCG", 2, ProductType.c, 613.082), + ("UCG", 1, ProductType.dWaterLoss, 306.025), ("UCG", 2, ProductType.dWaterLoss, 611.066), + ("UCG", 1, ProductType.w, 443.023), ("UCG", 2, ProductType.w, 748.064), + ("UCG", 1, ProductType.y, 363.057), ("UCG", 2, ProductType.y, 668.098), + ("UCG", 1, ProductType.yWaterLoss, 345.047), ("UCG", 2, ProductType.yWaterLoss, 650.089), + + ("UUG", 0, ProductType.M, 975.107), + ("UUG", 1, ProductType.aBaseLoss, 114.031), ("UUG", 2, ProductType.aBaseLoss, 420.056), + ("UUG", 1, ProductType.c, 308.041), ("UUG", 2, ProductType.c, 614.066), + ("UUG", 1, ProductType.dWaterLoss, 306.025), ("UUG", 2, ProductType.dWaterLoss, 612.050), + ("UUG", 1, ProductType.w, 443.023), ("UUG", 2, ProductType.w, 749.048), + ("UUG", 1, ProductType.y, 363.057), ("UUG", 2, ProductType.y, 669.082), + ("UUG", 1, ProductType.yWaterLoss, 345.047), ("UUG", 2, ProductType.yWaterLoss, 651.073), + + ("AUAG", 0, ProductType.M, 1247.220), + ("AUAG", 1, ProductType.aBaseLoss, 114.031), ("AUAG", 2, ProductType.aBaseLoss, 443.083), ("AUAG", 3, ProductType.aBaseLoss, 749.108), + ("AUAG", 1, ProductType.c, 331.068), ("AUAG", 2, ProductType.c, 637.093), ("AUAG", 3, ProductType.c, 966.146), + ("AUAG", 1, ProductType.dWaterLoss, 329.052), ("AUAG", 2, ProductType.dWaterLoss, 635.077), ("AUAG", 3, ProductType.dWaterLoss, 964.129), + ("AUAG", 1, ProductType.w, 363.057), ("AUAG", 2, ProductType.w, 692.109), ("AUAG", 3, ProductType.w, 998.134), + ("AUAG", 1, ProductType.y, 283.091), ("AUAG", 2, ProductType.y, 612.143), ("AUAG", 3, ProductType.y, 918.168), + ("AUAG", 1, ProductType.yWaterLoss, 265.081), ("AUAG", 2, ProductType.yWaterLoss, 594.134), ("AUAG", 3, ProductType.yWaterLoss, 900.159), + }; + + [Test] // test values calculated with 
http://rna.rega.kuleuven.be/masspec/mongo.htm + [TestCase("UAGUCGUUGAUAG", 4140.555, new[] { "UAG", "UCG", "UUG", "AUAG" }, + new[] { 998.134, 974.123, 975.107, 1247.220 })] + public static void TestDigestionAndFragmentation(string sequence, double monoMass, + string[] digestionProductSequences, double[] digestionProductMasses) + { + RNA rna = new(sequence); + Assert.That(rna.MonoisotopicMass, Is.EqualTo(monoMass).Within(0.01)); + + // digest RNA + var digestionParams = new RnaDigestionParams("RNase T1"); + var products = rna.Digest(digestionParams, new List(), new List()) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(products.Count, Is.EqualTo(digestionProductSequences.Length)); + + // ensure digestion sequence and masses are correct + for (var index = 0; index < products.Count; index++) + { + var digestionProduct = products[index]; + Assert.That(digestionProduct.BaseSequence, Is.EqualTo(digestionProductSequences[index])); + Assert.That(digestionProduct.MonoisotopicMass, Is.EqualTo(digestionProductMasses[index]).Within(0.01)); + + List fragments = new(); + digestionProduct.Fragment(DissociationType.CID, FragmentationTerminus.Both, fragments); + + List<(int FragmentNumber, ProductType Type, double Mass)[]> ughh = new(); + + // test that fragments are correct + var fragmentsToCompare = DigestFragmentTestCases + .Where(p => p.Sequence.Equals(digestionProduct.BaseSequence)).ToList(); + for (var i = 0; i < fragments.Count; i++) + { + var fragment = fragments[i]; + var theoreticalFragment = fragmentsToCompare[i]; + Assert.That(fragment.MonoisotopicMass, Is.EqualTo(theoreticalFragment.Mass).Within(0.01)); + Assert.That(fragment.FragmentNumber, Is.EqualTo(theoreticalFragment.FragmentNumber)); + Assert.That(fragment.ProductType, Is.EqualTo(theoreticalFragment.Type)); + Assert.That(fragment.FragmentNumber, Is.EqualTo(theoreticalFragment.FragmentNumber)); + if (fragment.Terminus == FragmentationTerminus.FivePrime) + Assert.That(fragment.AminoAcidPosition, 
Is.EqualTo(theoreticalFragment.FragmentNumber)); + else if (fragment.Terminus == FragmentationTerminus.None) + Assert.That(fragment.FragmentNumber, Is.EqualTo(0)); + else + Assert.That(fragment.AminoAcidPosition, Is.EqualTo(digestionProductSequences[index].Length - theoreticalFragment.FragmentNumber)); + } + } + } + + [Test] + [TestCase("UAGUCGUUGAUAG", new[] { "UAG", "UCG", "UUG", "AUAG" }, + new[] { 1, 4, 7, 10 }, new[] { 3, 6, 9, 13 }, new[] { '-', 'G', 'G', 'G' }, + new[] { 'U', 'U', 'A', '-' })] + public static void TestOligoWithSetMods_AAPositions(string sequence, string[] digestionProductSequences, + int[] startResidue, int[] endResidue, char[] preciousResidue, char[] nextResidue) + { + RNA rna = new RNA(sequence); + var digestionProducts = rna.Digest(new RnaDigestionParams("RNase T1"), new List(), + new List()).Select(p => (OligoWithSetMods)p).ToList(); + + Assert.That(digestionProducts.All(p => p.DigestionParams.DigestionAgent.Name == "RNase T1")); + for (var index = 0; index < digestionProducts.Count; index++) + { + var digestionProduct = digestionProducts[index]; + Assert.That(digestionProduct.BaseSequence, Is.EqualTo(digestionProductSequences[index])); + Assert.That(digestionProduct.OneBasedStartResidue, Is.EqualTo(startResidue[index])); + Assert.That(digestionProduct.OneBasedEndResidue, Is.EqualTo(endResidue[index])); + Assert.That(digestionProduct.PreviousResidue, Is.EqualTo(preciousResidue[index])); + Assert.That(digestionProduct.NextResidue, Is.EqualTo(nextResidue[index])); + } + } + + [Test] + public static void TestTermini_ThreePrimeCyclicPhosphate() + { + string sequence = "UAGUCGUUGAUAG"; + RNA rna = new RNA(sequence); + var oligoCyclicPhosphate = PtmListLoader.ReadModsFromString( + "ID Cyclic Phosphate\r\nTG X\r\nPP Oligo 3'-terminal.\r\nMT Digestion Termini\r\nCF H-2 O-1\r\nDR Unimod; 280.\r\n//", + out List<(Modification, string)> errors).First(); + var nucleicAcidCyclicPhosphate = PtmListLoader.ReadModsFromString( + "ID Cyclic 
Phosphate\r\nTG X\r\nPP 3'-terminal.\r\nMT Digestion Termini\r\nCF H-2 O-1\r\nDR Unimod; 280.\r\n//", + out errors).First(); + Assert.That(!errors.Any()); + + // top-down digestion, 3' terminal modification + var variableMods = new List { nucleicAcidCyclicPhosphate }; + var digestionParams = new RnaDigestionParams("top-down"); + var digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(2)); + Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); + Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG[Digestion Termini:Cyclic Phosphate on X]")); + + // top-down digestion, 3' oligo terminal modification + variableMods = new List { oligoCyclicPhosphate }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(1)); + Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); + + // RNase T1 digestion, 3' terminal modification + digestionParams = new RnaDigestionParams("RNase T1"); + variableMods = new List { nucleicAcidCyclicPhosphate }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(5)); + var expected = new List() + { + "UAG", "UCG", "UUG", "AUAG", "AUAG[Digestion Termini:Cyclic Phosphate on X]" + }; + for (int i = 0; i < expected.Count; i++) + { + Assert.That(digestionProducts[i].FullSequence, Is.EqualTo(expected[i])); + } + + // RNase T1 digestion, 3' oligo terminal modification + variableMods = new List { oligoCyclicPhosphate }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(7)); + expected = new List() + { + "UAG", "UAG[Digestion 
Termini:Cyclic Phosphate on X]", + "UCG", "UCG[Digestion Termini:Cyclic Phosphate on X]", + "UUG", "UUG[Digestion Termini:Cyclic Phosphate on X]", + "AUAG", + }; + + for (int i = 0; i < expected.Count; i++) + { + Assert.That(digestionProducts[i].FullSequence, Is.EqualTo(expected[i])); + } + } + + [Test] + public static void TestTermini_FivePrimeLargeMod() + { + string sequence = "UAGUCGUUGAUAG"; + RNA rna = new RNA(sequence); + var oligoLargeMod = PtmListLoader.ReadModsFromString( + "ID Pfizer 5'-Cap\r\nTG X\r\nPP Oligo 5'-terminal.\r\nMT Standard\r\nCF C13H22N5O14P3\r\nDR Unimod; 280.\r\n//", + out List<(Modification, string)> errors).First(); + var nucleicAcidLargeMod = PtmListLoader.ReadModsFromString( + "ID Pfizer 5'-Cap\r\nTG X\r\nPP 5'-terminal.\r\nMT Standard\r\nCF C13H22N5O14P3\r\nDR Unimod; 280.\r\n//", + out errors).First(); + Assert.That(!errors.Any()); + + // top-down digestion, 5' terminal modification + var variableMods = new List { nucleicAcidLargeMod }; + var digestionParams = new RnaDigestionParams("top-down"); + var digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(2)); + Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); + Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("[Standard:Pfizer 5'-Cap on X]UAGUCGUUGAUAG")); + + // top-down digestion, 5' oligo terminal modification + variableMods = new List { oligoLargeMod }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(1)); + Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); + + // RNase T1 digestion, 5' terminal modification + digestionParams = new RnaDigestionParams("RNase T1"); + variableMods = new List { nucleicAcidLargeMod }; + digestionProducts = rna.Digest(digestionParams, new List(), 
variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(5)); + var expected = new List() + { + "UAG", "[Standard:Pfizer 5'-Cap on X]UAG", "UCG", "UUG", "AUAG" + }; + for (int i = 0; i < expected.Count; i++) + { + Assert.That(digestionProducts[i].FullSequence, Is.EqualTo(expected[i])); + } + + // RNase T1 digestion, 5' oligo terminal modification + variableMods = new List { oligoLargeMod }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(7)); + expected = new List() + { + "UAG", + "UCG", "[Standard:Pfizer 5'-Cap on X]UCG", + "UUG", "[Standard:Pfizer 5'-Cap on X]UUG", + "AUAG", "[Standard:Pfizer 5'-Cap on X]AUAG" + }; + + for (int i = 0; i < expected.Count; i++) + { + Assert.That(digestionProducts[i].FullSequence, Is.EqualTo(expected[i])); + } + } + + [Test] + [TestCase("UAGUCGUUGAUAG")] + public static void TestOligoWithSetMods_PropertiesWithTopDownDigestion(string sequence) + { + var rna = new RNA(sequence); + var oligoWithSetMods = + rna.Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? 
throw new NullReferenceException(); + + Assert.That(rna.BaseSequence, Is.EqualTo(oligoWithSetMods.BaseSequence)); + Assert.That(rna.ThreePrimeTerminus, Is.EqualTo(oligoWithSetMods.ThreePrimeTerminus)); + Assert.That(rna.FivePrimeTerminus, Is.EqualTo(oligoWithSetMods.FivePrimeTerminus)); + Assert.That(rna.ThisChemicalFormula, Is.EqualTo(oligoWithSetMods.ThisChemicalFormula)); + Assert.That(rna.Length, Is.EqualTo(oligoWithSetMods.Length)); + } + + [Test] + public static void OligoWithSetMods_CalculatedValues() + { + var rna = new RNA("GUACUG"); + var rnaFormula = rna.ThisChemicalFormula; + + string modText = "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//"; + var sodiumAdduct = PtmListLoader.ReadModsFromString(modText, out List<(Modification, string)> mods).First(); + var oligoWithSetMods = + rna.Digest(new RnaDigestionParams(), new List() { sodiumAdduct }, new List()) + .First() as OligoWithSetMods ?? throw new NullReferenceException(); + + Assert.That(oligoWithSetMods.NumMods, Is.EqualTo(1)); + Assert.That(oligoWithSetMods.NumFixedMods, Is.EqualTo(1)); + Assert.That(oligoWithSetMods.NumVariableMods, Is.EqualTo(0)); + + var formula = oligoWithSetMods.ThisChemicalFormula; + Assert.That(formula, Is.EqualTo(rnaFormula + sodiumAdduct.ChemicalFormula)); + + var formulaToAdd = ChemicalFormula.ParseFormula("H"); + var deltaMass = formulaToAdd.MonoisotopicMass; + var oldMonoMass = oligoWithSetMods.MonoisotopicMass; + var oldMostAbundantMass = oligoWithSetMods.MostAbundantMonoisotopicMass; + + oligoWithSetMods.FivePrimeTerminus = formulaToAdd + oligoWithSetMods.FivePrimeTerminus; + + Assert.That(oligoWithSetMods.MonoisotopicMass, Is.EqualTo(oldMonoMass + deltaMass).Within(0.01)); + Assert.That(oligoWithSetMods.MostAbundantMonoisotopicMass, Is.EqualTo(oldMostAbundantMass + deltaMass).Within(0.01)); + Assert.That(oligoWithSetMods.ThisChemicalFormula, Is.EqualTo(formula + formulaToAdd)); + + oldMonoMass = oligoWithSetMods.MonoisotopicMass; + 
oldMostAbundantMass = oligoWithSetMods.MostAbundantMonoisotopicMass; + oligoWithSetMods.ThreePrimeTerminus = formulaToAdd + oligoWithSetMods.ThreePrimeTerminus; + + Assert.That(oligoWithSetMods.MonoisotopicMass, Is.EqualTo(oldMonoMass + deltaMass).Within(0.01)); + Assert.That(oligoWithSetMods.MostAbundantMonoisotopicMass, Is.EqualTo(oldMostAbundantMass + deltaMass).Within(0.01)); + Assert.That(oligoWithSetMods.ThisChemicalFormula, Is.EqualTo(formula + formulaToAdd + formulaToAdd)); + } + + #endregion + + #region DigestionParams + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestDigestionParams_Properties(RnaDigestionTestCase testCase) + { + var rna = new RNA(testCase.BaseSequence); + var digestionParams = new RnaDigestionParams(testCase.Enzyme, testCase.MissedCleavages, testCase.MinLength, + testCase.MaxLength); + + Assert.That(digestionParams.DigestionAgent, Is.EqualTo(RnaseDictionary.Dictionary[testCase.Enzyme])); + Assert.That(digestionParams.MaxMissedCleavages, Is.EqualTo(testCase.MissedCleavages)); + Assert.That(digestionParams.MinLength, Is.EqualTo(testCase.MinLength)); + Assert.That(digestionParams.MaxLength, Is.EqualTo(testCase.MaxLength)); + + digestionParams.MaxModificationIsoforms = 2048; + digestionParams.MaxMods = 3; + Assert.That(digestionParams.MaxModificationIsoforms, Is.EqualTo(2048)); + Assert.That(digestionParams.MaxMods, Is.EqualTo(3)); + + var digestionProducts = rna.Digest(digestionParams, new List(), new List()); + Assert.That(digestionProducts.Count(), Is.EqualTo(testCase.DigestionProductCount)); + } + + #endregion + + #region NucleicAcid + + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestNucleicAcid_Digestion_WithoutMods_Counts(RnaDigestionTestCase testCase) + { + var rna = new RNA(testCase.BaseSequence); + var digestionParams = new RnaDigestionParams(testCase.Enzyme, testCase.MissedCleavages, testCase.MinLength, + testCase.MaxLength); + + var digestionProducts = rna.Digest(digestionParams, new 
List(), new List()); + Assert.That(digestionProducts.Count(), Is.EqualTo(testCase.DigestionProductCount)); + } + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestNucleicAcid_Digestion_WithoutMods_Sequences(RnaDigestionTestCase testCase) + { + var rna = new RNA(testCase.BaseSequence); + var digestionParams = new RnaDigestionParams(testCase.Enzyme, testCase.MissedCleavages, testCase.MinLength, + testCase.MaxLength); + + var digestionProducts = rna.Digest(digestionParams, new List(), new List()) + .ToList(); + + Assert.That(digestionProducts.Count, Is.EqualTo(testCase.Sequences.Length)); + for (var i = 0; i < digestionProducts.Count; i++) + { + var product = digestionProducts[i]; + var testCaseCaseSequence = testCase.Sequences[i]; + Assert.That(product.BaseSequence, Is.EqualTo(testCaseCaseSequence)); + Assert.That(product.FullSequence, Is.EqualTo(testCaseCaseSequence)); + } + } + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestNucleicAcid_Digestion_WithoutMods_MonoMasses(RnaDigestionTestCase testCase) + { + var rna = new RNA(testCase.BaseSequence); + var digestionParams = new RnaDigestionParams(testCase.Enzyme, testCase.MissedCleavages, testCase.MinLength, + testCase.MaxLength); + + var digestionProducts = rna.Digest(digestionParams, new List(), new List()) + .ToList(); + + Assert.That(digestionProducts.Count, Is.EqualTo(testCase.Sequences.Length)); + for (var i = 0; i < digestionProducts.Count; i++) + { + var productMass = digestionProducts[i].MonoisotopicMass; + var testCaseCaseMass = testCase.MonoMasses[i]; + Assert.That(productMass, Is.EqualTo(testCaseCaseMass).Within(0.01)); + } + } + + #endregion + + #region Digestion with Modifications + + [Test] + public static void TestVariableModsCountCorrect() + { + string modText = "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A or C or G or U\r\nCF Na1H-1\r\n" + @"//"; + var sodiumAdducts = PtmListLoader.ReadModsFromString(modText, out List<(Modification, string)> mods) + .ToList(); + 
Assert.That(sodiumAdducts.Count, Is.EqualTo(4)); + + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams() + { + MaxMods = 1, + }; + + var precursors = rna.Digest(rnaDigestionParams, new List(), sodiumAdducts) + .ToList(); + Assert.That(precursors.Count, Is.EqualTo(7)); + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UACUG")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]ACUG")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]CUG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("GUACU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + + rnaDigestionParams.MaxMods = 2; + precursors = rna.Digest(rnaDigestionParams, new List(), sodiumAdducts) + .ToList(); + Assert.That(precursors.Count, Is.EqualTo(22)); + fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UACUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]U[Metal:Sodium on U]ACUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UA[Metal:Sodium on A]CUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UAC[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UACU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]ACUG")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]A[Metal:Sodium on A]CUG")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]AC[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]ACU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium 
on U]ACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]CUG")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]C[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]CU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]CUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]U[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GUACU[Metal:Sodium on U]G[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + } + + [Test] + public static void TestFixedModsCountCorrect() + { + string modText = "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//"; + var sodiumAdduct = PtmListLoader.ReadModsFromString(modText, out List<(Modification, string)> mods) + .ToList(); + + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams() + { + MaxMods = 1, + }; + var precursors = rna.Digest(rnaDigestionParams, sodiumAdduct, new List()) + .ToList(); + Assert.That(precursors.Count, Is.EqualTo(1)); + Assert.That(precursors.First().NumFixedMods, Is.EqualTo(1)); + Assert.That(precursors.First().FullSequence, Is.EqualTo("GUA[Metal:Sodium on A]CUG")); + Assert.That(precursors.First().MonoisotopicMass, Is.EqualTo(1896.26).Within(0.01)); + + modText = "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG G\r\nCF Na1H-1\r\n" + @"//"; + sodiumAdduct = PtmListLoader.ReadModsFromString(modText, out mods) + .ToList(); + + precursors = rna.Digest(rnaDigestionParams, sodiumAdduct, new List()) + .ToList(); + Assert.That(precursors.Count, Is.EqualTo(1)); + Assert.That(precursors.First().NumFixedMods, Is.EqualTo(2)); + 
Assert.That(precursors.First().FullSequence, Is.EqualTo("G[Metal:Sodium on G]UACUG[Metal:Sodium on G]")); + Assert.That(precursors.First().MonoisotopicMass, Is.EqualTo(1918.25).Within(0.01)); + } + + [Test] + public static void TestFixedAndVariableMods() + { + string modText = "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A or C or G or U\r\nCF Na1H-1\r\n" + @"//"; + string modText2 = "ID Potassium\r\nMT Metal\r\nPP Anywhere.\r\nTG A or C or G or U\r\nCF K1H-1\r\n" + @"//"; + var sodiumAdducts = PtmListLoader.ReadModsFromString(modText, out List<(Modification, string)> mods) + .ToList(); + var potassiumAdducts = PtmListLoader.ReadModsFromString(modText2, out mods) + .ToList(); + + Assert.That(sodiumAdducts.Count, Is.EqualTo(4)); + Assert.That(potassiumAdducts.Count, Is.EqualTo(4)); + + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams(); + + rnaDigestionParams.MaxMods = 1; + var fixedMods = new List { potassiumAdducts[0] }; // A + var variableMods = new List { sodiumAdducts[1] }; // C + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(2)); + Assert.That(precursors.All(p => p.NumFixedMods == 1)); + Assert.That(fullSequences.Contains("GUA[Metal:Potassium on A]CUG")); + Assert.That(fullSequences.Contains("GUA[Metal:Potassium on A]C[Metal:Sodium on C]UG")); + + var oneOfEach = precursors.First(p => p.FullSequence.Equals("GUA[Metal:Potassium on A]C[Metal:Sodium on C]UG")); + Assert.That(oneOfEach.NumFixedMods, Is.EqualTo(1)); + Assert.That(oneOfEach.NumVariableMods, Is.EqualTo(1)); + Assert.That(oneOfEach.NumMods, Is.EqualTo(2)); + + fixedMods = new List { potassiumAdducts[2] }; // G + variableMods = new List { sodiumAdducts[1] }; // C + precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + fullSequences = precursors.Select(p => p.FullSequence).ToList(); + 
Assert.That(precursors.Count, Is.EqualTo(2)); + Assert.That(precursors.All(p => p.NumFixedMods == 2)); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UAC[Metal:Sodium on C]UG[Metal:Potassium on G]")); + + fixedMods = new List { potassiumAdducts[2] }; // G + variableMods = new List { sodiumAdducts[1], sodiumAdducts[3] }; // C, U + precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(4)); + Assert.That(precursors.All(p => p.NumFixedMods == 2)); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UAC[Metal:Sodium on C]UG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACU[Metal:Sodium on U]G[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]U[Metal:Sodium on U]ACUG[Metal:Potassium on G]")); + + rnaDigestionParams.MaxMods = 2; + precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(7)); + Assert.That(precursors.All(p => p.NumFixedMods == 2)); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UAC[Metal:Sodium on C]UG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]U[Metal:Sodium on U]ACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACU[Metal:Sodium on U]G[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]U[Metal:Sodium on U]ACU[Metal:Sodium on U]G[Metal:Potassium on G]")); + 
Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UAC[Metal:Sodium on C]U[Metal:Sodium on U]G[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]U[Metal:Sodium on U]AC[Metal:Sodium on C]UG[Metal:Potassium on G]")); + } + + #endregion + } +} diff --git a/mzLib/Test/Transcriptomics/TestFragmentation.cs b/mzLib/Test/Transcriptomics/TestFragmentation.cs new file mode 100644 index 000000000..6086ecb70 --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestFragmentation.cs @@ -0,0 +1,239 @@ +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Text; +using System.Threading.Tasks; +using Transcriptomics; +using MassSpectrometry; +using Omics.Fragmentation; +using Omics.Fragmentation.Oligo; +using Omics.Modifications; +using Transcriptomics.Digestion; +using UsefulProteomicsDatabases; + +namespace Test.Transcriptomics +{ + [TestFixture] + [ExcludeFromCodeCoverage] + internal class TestFragmentation + { + + internal static IEnumerable GetSixMerIndividualFragmentTypeTestCases() => + TestNucleicAcid.GetSixmerIndividualFragmentTypeTestCases(); + + [Test] + [TestCaseSource(nameof(GetSixMerIndividualFragmentTypeTestCases))] + public void TestGetNeutralFragments(TestNucleicAcid.SixmerTestCase testCase) + { + var rna = new RNA("GUACUG") + .Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? 
throw new NullReferenceException();
+
+            var neutralFragments = rna.GetNeutralFragments(testCase.Type).ToList();
+            for (int i = 0; i < neutralFragments.Count; i++) // start at 0 so the first fragment mass is verified too
+            {
+                Assert.That(neutralFragments[i].NeutralMass, Is.EqualTo(testCase.NeutralMasses[i]).Within(0.01));
+            }
+        }
+
+
+        private static IEnumerable<DissociationType> ImplementedDissociationTypes // test-case source: loads elements, then yields each implemented dissociation type
+        {
+            get
+            {
+                Loaders.LoadElements();
+                foreach (var type in DissociationTypeCollection.AllImplementedDissociationTypes)
+                    yield return type;
+            }
+        }
+
+        /// <summary>
+        /// This test makes the assumption that the M ion is a component of all product types
+        /// </summary>
+        /// <param name="type">dissociation type whose fragment counts are checked</param>
+        [Test]
+        [TestCaseSource(nameof(ImplementedDissociationTypes))]
+        public void TestFragmentation_Unmodified_ProductCountsAreCorrect(DissociationType type)
+        {
+            Loaders.LoadElements();
+            List<Product> products = new();
+            var rnaToTest = new List<RNA>
+            {
+                new RNA("GUACUG"),
+                new RNA("GUACUGCACUGU"),
+                new RNA("GUACUGUAAUGAGACUAGUACAUGACAUG"),
+            };
+            var terminiToTest = new List<FragmentationTerminus> { FragmentationTerminus.Both, FragmentationTerminus.FivePrime, FragmentationTerminus.ThreePrime };
+            var potentialProducts = type.GetRnaProductTypesFromDissociationType();
+
+            // test with top down digestion and no modifications
+            var digestionparams = new RnaDigestionParams(rnase: "top-down");
+            var fixedMods = new List<Modification>();
+            var variableMods = new List<Modification>();
+            foreach (var term in terminiToTest)
+            {
+                foreach (var oligoWithSetMods in rnaToTest.Select(rna => rna.Digest(digestionparams, fixedMods, variableMods).First()))
+                {
+                    var terminalSpecifc = term == FragmentationTerminus.Both
+                        ? potentialProducts
+                        : potentialProducts.Where(p => p.GetRnaTerminusType() == term).ToList();
+
+                    var expectedProductCount = term == FragmentationTerminus.Both
+                        ?
(oligoWithSetMods.Length - 1) * (terminalSpecifc.Count - 1) + 1 // there is only one M ion, so for both, remove that form muliplier and add one + : (oligoWithSetMods.Length - 1) * terminalSpecifc.Count; + + oligoWithSetMods.Fragment(type, term, products); + Assert.That(products.Count, Is.EqualTo(expectedProductCount)); + Assert.That(products.All(p => terminalSpecifc.Contains(p.ProductType))); + } + } + } + + [Test] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.a, + new[] { 267.089, 573.114, 902.167, 1207.208, 1513.233 }, + new[] { 267.089, 573.114, 902.167 + 21.982, 1207.208 + 21.982, 1513.233 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.b, + new[] { 283.084, 589.109, 918.162, 1223.203, 1529.228 }, + new[] { 283.084, 589.109, 918.162 + 21.982, 1223.203 + 21.982, 1529.228 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.c, + new[] { 347.055, 653.081, 982.133, 1287.174, 1593.2 }, + new[] { 347.055, 653.081, 982.133 + 21.982, 1287.174 + 21.982, 1593.2 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.d, + new[] { 363.05, 669.075, 998.128, 1303.169, 1609.195 }, + new[] { 363.05, 669.075, 998.128 + 21.982, 1303.169 + 21.982, 1609.195 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.dWaterLoss, + new[] { 345.039, 651.064, 980.116, 1285.157, 1591.184 }, + new[] { 345.039, 651.064, 980.116 + 21.982, 1285.157 + 21.982, 1591.184 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT 
Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.w, + new[] { 363.049, 669.074, 974.115, 1303.169, 1609.195 }, + new[] { 363.049, 669.074, 974.115, 1303.169 + 21.982, 1609.195 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.x, + new[] { 347.055, 653.081, 958.122, 1287.174, 1593.2 }, + new[] { 347.055, 653.081, 958.122, 1287.174 + 21.982, 1593.2 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.y, + new[] { 283.084, 589.109, 894.15, 1223.203, 1529.228 }, + new[] { 283.084, 589.109, 894.15, 1223.203 + 21.982, 1529.228 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.z, + new[] { 267.089, 573.124, 878.156, 1207.208, 1513.233 }, + new[] { 267.089, 573.124, 878.156, 1207.208 + 21.982, 1513.233 + 21.982 })] + public void TestFragmentation_Modified(string sequence, string modString, string fullSequence, double unmodifiedMass, double modifiedMass, + ProductType productType, double[] unmodifiedFragmentMass, double[] modifiedFragmentMasses) + { + var mods = PtmListLoader.ReadModsFromString(modString, out List<(Modification, string)> modsOut).ToList(); + var rna = new RNA(sequence); + + var unmodifiedOligo = rna.Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? 
throw new NullReferenceException(); + Assert.That(unmodifiedOligo.AllModsOneIsNterminus.Count, Is.EqualTo(0)); + Assert.That(unmodifiedOligo.FullSequence, Is.EqualTo(sequence)); + Assert.That(unmodifiedOligo.MonoisotopicMass, Is.EqualTo(unmodifiedMass).Within(0.01)); + + var modifiedOligo = rna.Digest(new RnaDigestionParams(), mods, new List()) + .First() as OligoWithSetMods ?? throw new NullReferenceException(); + Assert.That(modifiedOligo.AllModsOneIsNterminus.Count, Is.EqualTo(mods.Count)); + Assert.That(modifiedOligo.FullSequence, Is.EqualTo(fullSequence)); + Assert.That(modifiedOligo.MonoisotopicMass, Is.EqualTo(modifiedMass).Within(0.01)); + + var unmodifiedProducts = unmodifiedOligo.GetNeutralFragments(productType).ToList(); + Assert.That(unmodifiedProducts.Count, Is.EqualTo(5)); + var modifiedProducts = modifiedOligo.GetNeutralFragments(productType).ToList(); + Assert.That(modifiedProducts.Count, Is.EqualTo(5)); + + + for (int i = 0; i < unmodifiedProducts.Count; i++) + { + var unModifedProduct = unmodifiedProducts[i]; + var modifiedProduct = modifiedProducts[i]; + + Assert.That(unModifedProduct.NeutralMass, Is.EqualTo(unmodifiedFragmentMass[i]).Within(0.01)); + Assert.That(modifiedProduct.NeutralMass, Is.EqualTo(modifiedFragmentMasses[i]).Within(0.01)); + } + } + + + [Test] + [TestCaseSource(nameof(GetSixMerIndividualFragmentTypeTestCases))] + public void TestRnaFragments(TestNucleicAcid.SixmerTestCase testCase) + { + var rna = new RNA("GUACUG") + .Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? 
throw new NullReferenceException(); + List products = rna.GetNeutralFragments(testCase.Type).Select(p => (Product)p).ToList(); + + for (int i = 0; i < products.Count; i++) + { + var product = products[i]; + Assert.That(testCase.Type, Is.EqualTo(product.ProductType)); + Assert.That(testCase.Type.GetRnaTerminusType(), Is.EqualTo(product.Terminus)); + Assert.That(testCase.NeutralMasses[i], Is.EqualTo(product.NeutralMass).Within(0.01)); + Assert.That(testCase.NeutralMasses[i], Is.EqualTo(product.MonoisotopicMass).Within(0.01)); + Assert.That(0, Is.EqualTo(product.NeutralLoss)); + Assert.That(null, Is.EqualTo(product.SecondaryProductType)); + Assert.That(0, Is.EqualTo(product.SecondaryFragmentNumber)); + + string annotation = $"{product.ProductType}{product.FragmentNumber}"; + Assert.That(annotation, Is.EqualTo(product.Annotation)); + string toString = + $"{product.ProductType}{product.FragmentNumber};{product.NeutralMass:F5}-{product.NeutralLoss:0.##}"; + Assert.That(toString, Is.EqualTo(product.ToString())); + } + } + + [Test] + [TestCaseSource(nameof(GetSixMerIndividualFragmentTypeTestCases))] + public void TestRnaFragmentNumbers(TestNucleicAcid.SixmerTestCase testCase) + { + var rna = new RNA("GUACUG") + .Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? throw new NullReferenceException(); + List products = rna.GetNeutralFragments(testCase.Type).Select(p => (Product)p).ToList(); + + for (int i = 0; i < products.Count; i++) + { + var product = products[i]; + bool isThreePrime = product.ProductType.GetRnaTerminusType() == FragmentationTerminus.ThreePrime; + + int fragmentNumber = i + 1; + int residuePosition = isThreePrime ? 
rna.Length - fragmentNumber : fragmentNumber; + + Assert.That(product.FragmentNumber, Is.EqualTo(fragmentNumber)); + Assert.That(product.ResiduePosition, Is.EqualTo(residuePosition)); + } + + } + + [Test] + public void TestConstructorAndEquality() + { + Product product1 = new Product(ProductType.d, FragmentationTerminus.FivePrime, 200, 4, 4, 0.0); + Product product2 = new Product(ProductType.d, FragmentationTerminus.FivePrime, 200, 4, 4, 0.0); + Product uniqueProduct = new Product(ProductType.a, FragmentationTerminus.FivePrime, 201, 4, 4, 0.0); + + Assert.That(product1.Equals(product1)); + Assert.That(product1.Equals(product2)); + Assert.That(product1.GetHashCode(), Is.EqualTo(product2.GetHashCode())); + Assert.That(!product1.Equals(uniqueProduct)); + Assert.That(!product1.Equals(null)); + Assert.That(product1.GetHashCode(), Is.Not.EqualTo(uniqueProduct.GetHashCode())); + + Assert.That(product1.Equals((object)product1)); + Assert.That(product1.Equals((object)product2)); + Assert.That(!product1.Equals((object)uniqueProduct)); + Assert.That(!product1.Equals((object)new Product(ProductType.d, FragmentationTerminus.N, 200, 4, 4, 0.0))); + Assert.That(!product1.Equals((object)null)); + } + } +} diff --git a/mzLib/Test/Transcriptomics/TestNucleicAcid.cs b/mzLib/Test/Transcriptomics/TestNucleicAcid.cs new file mode 100644 index 000000000..efbf05020 --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestNucleicAcid.cs @@ -0,0 +1,171 @@ +using NUnit.Framework.Legacy; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using Chemistry; +using Omics.Fragmentation; +using Transcriptomics; +using UsefulProteomicsDatabases; + +namespace Test.Transcriptomics +{ + /// + /// Test Data generated with http://rna.rega.kuleuven.be/masspec/mongo.htm + /// + [TestFixture] + [ExcludeFromCodeCoverage] + internal class TestNucleicAcid + { + internal record SixmerTestCase(string Sequence, ProductType Type, 
double[] NeutralMasses, string[] ChemicalFormulas); + + internal static IEnumerable GetSixmerIndividualFragmentTypeTestCases() + { + Loaders.LoadElements(); + + yield return new SixmerTestCase("GUACUG", ProductType.a, + new[] { 267.089, 573.114, 902.167, 1207.208, 1513.233 }, + new[] { "C10H13N5O4", "C19H24N7O12P", "C29H36N12O18P2", "C38H48N15O25P3", "C47H59N17O33P4" }); + yield return new SixmerTestCase("GUACUG", ProductType.b, + new[] { 283.084, 589.109, 918.162, 1223.203, 1529.228 }, + new[] { "C10H13N5O5", "C19H24N7O13P", "C29H36N12O19P2", "C38H48N15O26P3", "C47H59N17O34P4" }); + yield return new SixmerTestCase("GUACUG", ProductType.c, + new[] { 347.055, 653.081, 982.133, 1287.174, 1593.2 }, + new[] { "C10H14N5O7P", "C19H25N7O15P2", "C29H37N12O21P3", "C38H49N15O28P4", "C47H60N17O36P5", }); + yield return new SixmerTestCase("GUACUG", ProductType.d, + new[] { 363.05, 669.075, 998.128, 1303.169, 1609.195 }, + new[] { "C10H14N5O8P", "C19H25N7O16P2", "C29H37N12O22P3", "C38H49N15O29P4", "C47H60N17O37P5", }); + yield return new SixmerTestCase("GUACUG", ProductType.dWaterLoss, + new[] { 345.039, 651.064, 980.116, 1285.157, 1591.184 }, + new[] { "C10H12N5O7P", "C19H23N7O15P2", "C29H35N12O21P3", "C38H47N15O28P4", "C47H58N17O36P5", }); + yield return new SixmerTestCase("GUACUG", ProductType.w, + new[] { 363.049, 669.074, 974.115, 1303.169, 1609.195 }, + new[] { "C10H14N5O8P", "C19H25N7O16P2", "C28H37N10O23P3", "C38H49N15O29P4", "C47H60N17O37P5", }); + yield return new SixmerTestCase("GUACUG", ProductType.x, + new[] { 347.055, 653.081, 958.122, 1287.174, 1593.2 }, + new[] { "C10H14N5O7P", "C19H25N7O15P2", "C28H37N10O22P3", "C38H49N15O28P4", "C47H60N17O36P5" }); + yield return new SixmerTestCase("GUACUG", ProductType.y, + new[] { 283.084, 589.109, 894.15, 1223.203, 1529.228 }, + new[] { "C10H13N5O5", "C19H24N7O13P", "C28H36N10O20P2", "C38H48N15O26P3", "C47H59N17O34P4", }); + yield return new SixmerTestCase("GUACUG", ProductType.z, + new[] { 267.089, 573.124, 878.156, 
1207.208, 1513.233 }, + new[] { "C10H13N5O4", "C19H24N7O12P", "C28H36N10O19P2", "C38H48N15O25P3", "C47H59N17O33P4", }); + + + yield return new SixmerTestCase("GUACUG", ProductType.aBaseLoss, + new[] { 114.03, 459.07, 765.095, 1094.147, 1399.198 }, + new[] { "C5H6O3", "C15H18N5O10P", "C24H29N7O18P2", "C34H41N12O24P3", "C43H53N15O31P4" }); + yield return new SixmerTestCase("GUACUG", ProductType.bBaseLoss, + new[] { 130.027, 475.074, 781.099, 1110.152, 1415.193 }, + new[] { "C5H6O4", "C15H18N5O11P", "C24H29N7O19P2", "C34H41N12O25P3", "C43H53N15O32P4" }); + yield return new SixmerTestCase("GUACUG", ProductType.cBaseLoss, + new[] { 193.998, 539.045, 845.071, 1174.123, 1479.164 }, + new[] { "C5H7O6P", "C15H19N5O13P2", "C24H30N7O21P3", "C34H42N12O27P4", "C43H54N15O34P5" }); + yield return new SixmerTestCase("GUACUG", ProductType.dBaseLoss, + new[] { 209.993, 555.04, 861.066, 1190.118, 1495.16 }, + new[] { "C5H7O7P", "C15H19N5O14P2", "C24H30N7O22P3", "C34H42N12O28P4", "C43H54N15O35P5" }); + + // TODO: Add water loss besides d-H2O + } + + + [Test] + [TestCase("GUACUG", 1874.281)] + [TestCase("A", 267.096)] + [TestCase("C", 243.085)] + [TestCase("U", 244.069)] + [TestCase("G", 283.091)] + [TestCase("GU", 589.116)] + [TestCase("AAA", 925.200)] + [TestCase("CCC", 853.166)] + [TestCase("UUU", 856.119)] + [TestCase("GGG", 973.185)] + public void TestConstructorsAndEquality(string sequence, double monoMass) + { + // test constructors and equality + RNA rna = new RNA(sequence); + + Assert.That(rna.Length, Is.EqualTo(sequence.Length)); + Assert.That(rna.MonoisotopicMass, Is.EqualTo(monoMass).Within(0.01)); + Assert.That(rna.GetChemicalFormula().MonoisotopicMass, Is.EqualTo(monoMass).Within(0.01)); + Assert.That(rna.NucleicAcidArray.Length, Is.EqualTo(sequence.Length)); + CollectionAssert.AreEqual(rna.NucleicAcidArray.Select(p => p.Letter), sequence); + Assert.That(rna.FivePrimeTerminus.Equals(NucleicAcid.DefaultFivePrimeTerminus)); + 
Assert.That(rna.ThreePrimeTerminus.Equals(NucleicAcid.DefaultThreePrimeTerminus));
+            List<Nucleotide> nucList = new();
+            foreach (var nucleotide in sequence)
+            {
+                nucList.Add(Nucleotide.GetResidue(nucleotide));
+            }
+            Assert.That(rna.NucleicAcidArray.SequenceEqual(nucList.ToArray()));
+
+            var rna2 = new RNA(sequence, NucleicAcid.DefaultFivePrimeTerminus, NucleicAcid.DefaultThreePrimeTerminus);
+
+            Assert.That(rna2.Length, Is.EqualTo(sequence.Length));
+            Assert.That(rna2.MonoisotopicMass, Is.EqualTo(monoMass).Within(0.01));
+            Assert.That(rna2.FivePrimeTerminus.Equals(NucleicAcid.DefaultFivePrimeTerminus)); // verify the explicit-termini instance, not rna again
+            Assert.That(rna2.ThreePrimeTerminus.Equals(NucleicAcid.DefaultThreePrimeTerminus));
+            nucList.Clear();
+            foreach (var nucleotide in sequence)
+            {
+                nucList.Add(Nucleotide.GetResidue(nucleotide));
+            }
+            Assert.That(rna2.NucleicAcidArray.SequenceEqual(nucList.ToArray()));
+
+            Assert.That(rna.Equals(rna2));
+            Assert.That(rna.Equals(rna));
+            Assert.That(!rna.Equals(null));
+            Assert.That(rna.Equals((object)rna2));
+            Assert.That(rna.Equals((object)rna));
+            Assert.That(!rna.Equals((object)null));
+            Assert.That(!rna.Equals((object)new Double()));
+        }
+
+        [Test]
+        public void TestParseSequence()
+        {
+            var rna1 = new RNA("GUACUG");
+            var rna2 = new RNA("GU ACU G");
+            var rna3 = new RNA("GU*ACU*G");
+
+            Assert.That(rna1.BaseSequence, Is.EqualTo(rna2.BaseSequence));
+            Assert.That(rna1.BaseSequence, Is.EqualTo(rna3.BaseSequence));
+            Assert.That(rna1.GetHashCode(), Is.EqualTo(rna2.GetHashCode())); // compare against both parsed variants, not rna3 twice
+            Assert.That(rna1.GetHashCode(), Is.EqualTo(rna3.GetHashCode()));
+            Assert.That(rna1.Length, Is.EqualTo(rna2.Length));
+            Assert.That(rna1.Length, Is.EqualTo(rna3.Length));
+
+            Assert.Throws(() => new RNA("GUA~CUG")); // NOTE(review): the exception type argument appears to be missing here - confirm against the original file
+        }
+
+        [Test]
+        [TestCase("GUACUG", new[] { -1, -2, -3, -4, -5 }, new[] { 1873.273, 936.133, 623.752, 467.562, 373.848 })]
+        public void TestElectroSpraySeries(string sequence, int[] charges, double[] mzs)
+        {
+            RNA rna = new(sequence);
+
+            int i = 0;
+            foreach (var ion in
rna.GetElectrospraySeries(charges.First(), charges.Last()))
+            {
+                Assert.That(ion, Is.EqualTo(mzs[i]).Within(0.001));
+                i++;
+            }
+        }
+
+        [Test]
+        [TestCase("GUACUG", new[] { -1, -2, -3, -4, -5, -6 }, new[] { 1953.239, 976.116, 650.408, 487.554, 389.841, 324.700 })]
+        public void TestReplaceTerminusWithElectroSpraySeries(string sequence, int[] charges, double[] mzs)
+        {
+            RNA rna = new(sequence); // use the test-case sequence rather than a duplicated literal
+            rna.FivePrimeTerminus = new ChemicalFormula(); // replace the default 5' terminus with an empty formula before computing the series
+
+            int i = 0;
+            foreach (var ion in rna.GetElectrospraySeries(charges.First(), charges.Last()))
+            {
+                Assert.That(ion, Is.EqualTo(mzs[i]).Within(0.001));
+                i++;
+            }
+        }
+    }
+}
diff --git a/mzLib/Test/Transcriptomics/TestProductType.cs b/mzLib/Test/Transcriptomics/TestProductType.cs
new file mode 100644
index 000000000..be9dc2f93
--- /dev/null
+++ b/mzLib/Test/Transcriptomics/TestProductType.cs
@@ -0,0 +1,280 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics.CodeAnalysis;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Chemistry;
+using MassSpectrometry;
+using NUnit.Framework;
+using NUnit.Framework.Legacy;
+using Omics.Fragmentation;
+using Omics.Fragmentation.Oligo;
+using Omics.Modifications;
+using Transcriptomics;
+using Transcriptomics.Digestion;
+
+namespace Test.Transcriptomics
+{
+    [TestFixture]
+    [ExcludeFromCodeCoverage]
+    public class TestProductType
+    {
+        [Test]
+        [TestCase(DissociationType.HCD, new[] { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d,
+            ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.z, ProductType.M })]
+        [TestCase(DissociationType.CID, new[] { ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss,
+            ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M })]
+        public void TestProductTypes_Dissociation(DissociationType dissociation, ProductType[] products)
+        {
+            CollectionAssert.AreEquivalent(products,
dissociation.GetRnaProductTypesFromDissociationType()); + } + + [Test] + [TestCase(FragmentationTerminus.FivePrime, new[] + { + ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, + ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, + ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, + ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, + })] + [TestCase(FragmentationTerminus.ThreePrime, new[] + { + ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, + ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, + ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, + ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, + })] + public void TestProductTypes_Terminus(FragmentationTerminus terminus, ProductType[] products) + { + CollectionAssert.AreEquivalent(products, terminus.GetRnaTerminusSpecificProductTypes()); + } + + [Test] + [TestCase(DissociationType.HCD, FragmentationTerminus.FivePrime, new[] + { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, ProductType.dWaterLoss, })] + [TestCase(DissociationType.HCD, FragmentationTerminus.ThreePrime, new[] + { ProductType.w, ProductType.x, ProductType.y, ProductType.z, })] + [TestCase(DissociationType.HCD, FragmentationTerminus.Both, new[] + { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.z, ProductType.M })] + [TestCase(DissociationType.CID, FragmentationTerminus.FivePrime, new[] + { ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss })] + [TestCase(DissociationType.CID, FragmentationTerminus.ThreePrime, new[] + { ProductType.w, ProductType.y, ProductType.yWaterLoss })] + [TestCase(DissociationType.CID, FragmentationTerminus.Both, new[] + { ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, ProductType.y, ProductType.yWaterLoss, 
ProductType.M })] + public void TestProductTypes_TerminusAndDissociation(DissociationType dissociation, FragmentationTerminus terminus, ProductType[] products) + { + CollectionAssert.AreEquivalent(products, dissociation.GetRnaTerminusSpecificProductTypesFromDissociation(terminus)); + } + + [Test] + public static void Test_NeutralMassShiftFromProductType() + { + foreach (ProductType p in Enum.GetValues(typeof(ProductType))) + { + double mass = 0; + switch (p) + { + case ProductType.a: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.b: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("OH").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.c: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O3H2P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.x: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-1H").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.y: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-3P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.zWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-5H-2P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.aWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H-1O-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case 
ProductType.aBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H-2").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.bBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O1H-2").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.cWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O2P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.cBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O3H-1P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.d: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O4H2P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.dWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O3P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.dBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O4H-1P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + + case ProductType.w: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.wWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H-1O-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.xWaterLoss: + 
mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-2H-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.yWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-4H-2P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.z: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-4P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.wBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H-2").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + case ProductType.xBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-1H-2").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + case ProductType.yBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-3H-2P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + case ProductType.zBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-4H-3P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + } + } + } + + [Test] + public void TestProductTypes_GetRnaTerminusType() + { + foreach (var type in Enum.GetValues()) + { + switch (type) + { + case ProductType.a: + case ProductType.aWaterLoss: + case ProductType.aBaseLoss: + case ProductType.b: + case ProductType.bWaterLoss: + case ProductType.bBaseLoss: + case ProductType.c: + case ProductType.cWaterLoss: + case ProductType.cBaseLoss: + case ProductType.d: + case ProductType.dWaterLoss: + case ProductType.dBaseLoss: + 
Assert.That(type.GetRnaTerminusType(), Is.EqualTo(FragmentationTerminus.FivePrime)); + break; + + case ProductType.w: + case ProductType.wWaterLoss: + case ProductType.wBaseLoss: + case ProductType.x: + case ProductType.xWaterLoss: + case ProductType.xBaseLoss: + case ProductType.y: + case ProductType.yWaterLoss: + case ProductType.yBaseLoss: + case ProductType.z: + case ProductType.zWaterLoss: + case ProductType.zBaseLoss: + Assert.That(type.GetRnaTerminusType(), Is.EqualTo(FragmentationTerminus.ThreePrime)); + break; + + case ProductType.M: + Assert.That(type.GetRnaTerminusType(), Is.EqualTo(FragmentationTerminus.None)); + break; + + case ProductType.aStar: + case ProductType.bAmmoniaLoss: + case ProductType.D: + case ProductType.Ycore: + case ProductType.Y: + case ProductType.aDegree: + case ProductType.yAmmoniaLoss: + case ProductType.zPlusOne: + case ProductType.zDot: + Assert.Throws(() => type.GetRnaTerminusType()); + break; + default: + throw new ArgumentOutOfRangeException(); + } + } + } + + [Test] + [TestCase(ProductType.a, ProductType.aWaterLoss)] + [TestCase(ProductType.b, ProductType.bWaterLoss)] + [TestCase(ProductType.c, ProductType.cWaterLoss)] + [TestCase(ProductType.d, ProductType.dWaterLoss)] + [TestCase(ProductType.w, ProductType.wWaterLoss)] + [TestCase(ProductType.x, ProductType.xWaterLoss)] + [TestCase(ProductType.y, ProductType.yWaterLoss)] + [TestCase(ProductType.z, ProductType.zWaterLoss)] + public void EnsureWaterLossMassesAreCorrect(ProductType normal, ProductType waterLoss) + { + var rna = new RNA("GUACUG") + .Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? 
throw new NullReferenceException(); + + List normalFragments = rna.GetNeutralFragments(normal).ToList(); + List waterLossFragments = rna.GetNeutralFragments(waterLoss).ToList(); + for (var index = 0; index < waterLossFragments.Count; index++) + { + var waterLossFragment = waterLossFragments[index]; + var normalFragment = normalFragments[index]; + var watermass = 2 * Constants.ProtonMass + PeriodicTable.GetElement("O").PrincipalIsotope.AtomicMass; + + Assert.That(normalFragment.MonoisotopicMass, Is.EqualTo(waterLossFragment.MonoisotopicMass + watermass).Within(0.01)); + } + } + } +} diff --git a/mzLib/Test/Transcriptomics/TestRnase.cs b/mzLib/Test/Transcriptomics/TestRnase.cs index db7d3e3dc..e72c12e11 100644 --- a/mzLib/Test/Transcriptomics/TestRnase.cs +++ b/mzLib/Test/Transcriptomics/TestRnase.cs @@ -1,5 +1,4 @@ using NUnit.Framework; -using Assert = NUnit.Framework.Legacy.ClassicAssert; using System.Diagnostics.CodeAnalysis; using System.IO; using Proteomics.ProteolyticDigestion; @@ -16,7 +15,7 @@ internal class TestRnase public void TestRnaseDictionaryLoading() { var rnaseCountFromTsv = File.ReadAllLines(rnaseTsvpath).Length - 1; - Assert.AreEqual(RnaseDictionary.Dictionary.Count, rnaseCountFromTsv); + Assert.That(RnaseDictionary.Dictionary.Count, Is.EqualTo(rnaseCountFromTsv)); } [Test] diff --git a/mzLib/Transcriptomics/NucleicAcid.cs b/mzLib/Transcriptomics/NucleicAcid.cs index ef6b74cf9..5a42b7d67 100644 --- a/mzLib/Transcriptomics/NucleicAcid.cs +++ b/mzLib/Transcriptomics/NucleicAcid.cs @@ -16,7 +16,6 @@ namespace Transcriptomics /// public abstract class NucleicAcid : INucleicAcid, IBioPolymer, IEquatable { - #region Static Properties /// @@ -43,21 +42,40 @@ public abstract class NucleicAcid : INucleicAcid, IBioPolymer, IEquatable + /// For creating an RNA programatically + /// + /// + /// + /// + /// protected NucleicAcid(string sequence, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, IDictionary>? 
oneBasedPossibleLocalizedModifications = null) { MonoisotopicMass = 0; - Length = sequence.Length; - _nucleicAcids = new Nucleotide[Length]; + _nucleicAcids = new Nucleotide[sequence.Length]; ThreePrimeTerminus = threePrimeTerm ??= DefaultThreePrimeTerminus; FivePrimeTerminus = fivePrimeTerm ??= DefaultFivePrimeTerminus; _oneBasedPossibleLocalizedModifications = oneBasedPossibleLocalizedModifications ?? new Dictionary>(); GeneNames = new List>(); - ParseSequence(sequence); } + /// + /// For Reading in from rna database + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// protected NucleicAcid(string sequence, string name, string identifier, string organism, string databaseFilePath, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, IDictionary>? oneBasedPossibleLocalizedModifications = null, @@ -126,10 +144,8 @@ public IHasChemicalFormula ThreePrimeTerminus /// /// Gets the number of nucleic acids in this nucleic acid polymer /// - public int Length { get; private set; } - + public int Length => BaseSequence.Length; - // TODO: These interface members public string Name { get; } public string FullName => Name; // TODO: Consider if this needs to be different from the name public string DatabaseFilePath { get; } @@ -257,7 +273,7 @@ public ChemicalFormula GetChemicalFormula() #region Private Methods - bool ReplaceTerminus(ref IHasChemicalFormula terminus, IHasChemicalFormula value) + private bool ReplaceTerminus(ref IHasChemicalFormula terminus, IHasChemicalFormula value) { if (Equals(value, terminus)) return false; @@ -319,7 +335,6 @@ private bool ParseSequence(string sequence) } _sequence = sb.ToString(); - Length = index; MonoisotopicMass += monoMass; Array.Resize(ref _nucleicAcids, Length); From a09d90a20daf48ffde25817223e837ed96d7f407 Mon Sep 17 00:00:00 2001 From: nbollis Date: Thu, 19 Sep 2024 17:49:43 -0500 Subject: [PATCH 03/17] Made initial tests pass --- mzLib/Test/Transcriptomics/TestDigestion.cs | 7 
++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mzLib/Test/Transcriptomics/TestDigestion.cs b/mzLib/Test/Transcriptomics/TestDigestion.cs index 6b385be0f..fe315ffd6 100644 --- a/mzLib/Test/Transcriptomics/TestDigestion.cs +++ b/mzLib/Test/Transcriptomics/TestDigestion.cs @@ -3,6 +3,7 @@ using System.Diagnostics.CodeAnalysis; using System.IO; using System.Linq; +using System.Reflection.Metadata.Ecma335; using System.Security.Cryptography; using Chemistry; using MassSpectrometry; @@ -304,15 +305,15 @@ public static void TestDigestionAndFragmentation(string sequence, double monoMas List fragments = new(); digestionProduct.Fragment(DissociationType.CID, FragmentationTerminus.Both, fragments); - List<(int FragmentNumber, ProductType Type, double Mass)[]> ughh = new(); - // test that fragments are correct var fragmentsToCompare = DigestFragmentTestCases .Where(p => p.Sequence.Equals(digestionProduct.BaseSequence)).ToList(); for (var i = 0; i < fragments.Count; i++) { var fragment = fragments[i]; - var theoreticalFragment = fragmentsToCompare[i]; + var theoreticalFragment = fragmentsToCompare.FirstOrDefault(p => + p.FragmentNumber == fragment.FragmentNumber && p.Type == fragment.ProductType); + if (theoreticalFragment.Mass is 0.0 ) continue; Assert.That(fragment.MonoisotopicMass, Is.EqualTo(theoreticalFragment.Mass).Within(0.01)); Assert.That(fragment.FragmentNumber, Is.EqualTo(theoreticalFragment.FragmentNumber)); Assert.That(fragment.ProductType, Is.EqualTo(theoreticalFragment.Type)); From 4dfb542544f8e336512035d80d10b53510d9abca Mon Sep 17 00:00:00 2001 From: nbollis Date: Thu, 19 Sep 2024 18:19:31 -0500 Subject: [PATCH 04/17] Removed unnecessary namespaces --- mzLib/MzLibUtil/ClassExtensions.cs | 2 +- mzLib/Test/Transcriptomics/TestDigestion.cs | 3 --- mzLib/Test/Transcriptomics/TestFragmentation.cs | 3 --- mzLib/Test/Transcriptomics/TestProductType.cs | 2 -- mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs | 7 +------ 
mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs | 6 ------ mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs | 6 ------ 7 files changed, 2 insertions(+), 27 deletions(-) diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index 05e5cfd1e..0129154a4 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -19,7 +19,6 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Text; using System.Text.RegularExpressions; namespace MzLibUtil @@ -123,5 +122,6 @@ public static string GetPeriodTolerantFilenameWithoutExtension(this string fileP { return PeriodTolerantFilenameWithoutExtension.GetPeriodTolerantFilenameWithoutExtension(filePath); } + } } \ No newline at end of file diff --git a/mzLib/Test/Transcriptomics/TestDigestion.cs b/mzLib/Test/Transcriptomics/TestDigestion.cs index fe315ffd6..bf31392ca 100644 --- a/mzLib/Test/Transcriptomics/TestDigestion.cs +++ b/mzLib/Test/Transcriptomics/TestDigestion.cs @@ -3,11 +3,8 @@ using System.Diagnostics.CodeAnalysis; using System.IO; using System.Linq; -using System.Reflection.Metadata.Ecma335; -using System.Security.Cryptography; using Chemistry; using MassSpectrometry; -using MathNet.Numerics.Distributions; using NUnit.Framework; using Omics.Digestion; using Omics.Fragmentation; diff --git a/mzLib/Test/Transcriptomics/TestFragmentation.cs b/mzLib/Test/Transcriptomics/TestFragmentation.cs index 6086ecb70..8f7bb2d78 100644 --- a/mzLib/Test/Transcriptomics/TestFragmentation.cs +++ b/mzLib/Test/Transcriptomics/TestFragmentation.cs @@ -3,9 +3,6 @@ using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; using System.Linq; -using System.Runtime.CompilerServices; -using System.Text; -using System.Threading.Tasks; using Transcriptomics; using MassSpectrometry; using Omics.Fragmentation; diff --git a/mzLib/Test/Transcriptomics/TestProductType.cs b/mzLib/Test/Transcriptomics/TestProductType.cs index 
be9dc2f93..f9c459211 100644 --- a/mzLib/Test/Transcriptomics/TestProductType.cs +++ b/mzLib/Test/Transcriptomics/TestProductType.cs @@ -2,8 +2,6 @@ using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; using System.Linq; -using System.Text; -using System.Threading.Tasks; using Chemistry; using MassSpectrometry; using NUnit.Framework; diff --git a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs index a741638c5..3a7382e7c 100644 --- a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs +++ b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs @@ -1,9 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using Chemistry; +using Chemistry; using Omics.Digestion; using Omics.Modifications; diff --git a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs index 92b5e501c..966f97c50 100644 --- a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs +++ b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs @@ -4,12 +4,6 @@ using Omics.Fragmentation; using Omics.Modifications; using Omics; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Security.Cryptography; -using System.Text; -using System.Threading.Tasks; using Easy.Common.Extensions; using Omics.Fragmentation.Oligo; diff --git a/mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs b/mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs index 379e48fa9..fb80a1a0b 100644 --- a/mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs +++ b/mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs @@ -1,10 +1,5 @@ using Omics.Digestion; using Omics.Fragmentation; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; namespace Transcriptomics.Digestion { @@ -12,7 +7,6 @@ public class RnaDigestionParams : IDigestionParams { // 
this parameterless constructor needs to exist to read the toml. - // if you can figure out a way to get rid of it, feel free... public RnaDigestionParams() : this("top-down") { } From 2f6b6cb975679c1e9856cec28a26914079b6c312 Mon Sep 17 00:00:00 2001 From: nbollis Date: Thu, 19 Sep 2024 19:17:43 -0500 Subject: [PATCH 05/17] Expanded test coverage --- mzLib/Test/Transcriptomics/TestDigestion.cs | 26 +++++++++++++++++++ .../Test/Transcriptomics/TestFragmentation.cs | 4 +-- mzLib/Test/Transcriptomics/TestNucleicAcid.cs | 6 ++--- mzLib/Test/Transcriptomics/TestNucleotide.cs | 6 ++--- mzLib/Test/Transcriptomics/TestRnase.cs | 2 +- 5 files changed, 35 insertions(+), 9 deletions(-) diff --git a/mzLib/Test/Transcriptomics/TestDigestion.cs b/mzLib/Test/Transcriptomics/TestDigestion.cs index bf31392ca..d210b6a2e 100644 --- a/mzLib/Test/Transcriptomics/TestDigestion.cs +++ b/mzLib/Test/Transcriptomics/TestDigestion.cs @@ -555,6 +555,32 @@ public void TestDigestionParams_Properties(RnaDigestionTestCase testCase) Assert.That(digestionProducts.Count(), Is.EqualTo(testCase.DigestionProductCount)); } + [Test] + public void TestDigestionParamsClone() + { + var digestionParams = new RnaDigestionParams("top-down", 0, 3, 20000); + var cloned = digestionParams.Clone(FragmentationTerminus.C); + + // set new terminus, all values except terminus are retained + Assert.That(digestionParams.DigestionAgent, Is.EqualTo(cloned.DigestionAgent)); + Assert.That(digestionParams.MaxMissedCleavages, Is.EqualTo(cloned.MaxMissedCleavages)); + Assert.That(digestionParams.MinLength, Is.EqualTo(cloned.MinLength)); + Assert.That(digestionParams.MaxLength, Is.EqualTo(cloned.MaxLength)); + Assert.That(digestionParams.MaxMods, Is.EqualTo(cloned.MaxMods)); + Assert.That(digestionParams.FragmentationTerminus, Is.Not.EqualTo(cloned.FragmentationTerminus)); + Assert.That(cloned.FragmentationTerminus, Is.EqualTo(FragmentationTerminus.C)); + + // do not set new terminus, all values are retained + cloned = 
digestionParams.Clone(); + Assert.That(digestionParams.DigestionAgent, Is.EqualTo(cloned.DigestionAgent)); + Assert.That(digestionParams.MaxMissedCleavages, Is.EqualTo(cloned.MaxMissedCleavages)); + Assert.That(digestionParams.MinLength, Is.EqualTo(cloned.MinLength)); + Assert.That(digestionParams.MaxLength, Is.EqualTo(cloned.MaxLength)); + Assert.That(digestionParams.MaxMods, Is.EqualTo(cloned.MaxMods)); + Assert.That(digestionParams.FragmentationTerminus, Is.EqualTo(cloned.FragmentationTerminus)); + Assert.That(cloned.FragmentationTerminus, Is.EqualTo(FragmentationTerminus.Both)); + } + #endregion #region NucleicAcid diff --git a/mzLib/Test/Transcriptomics/TestFragmentation.cs b/mzLib/Test/Transcriptomics/TestFragmentation.cs index 8f7bb2d78..fea764246 100644 --- a/mzLib/Test/Transcriptomics/TestFragmentation.cs +++ b/mzLib/Test/Transcriptomics/TestFragmentation.cs @@ -15,10 +15,10 @@ namespace Test.Transcriptomics { [TestFixture] [ExcludeFromCodeCoverage] - internal class TestFragmentation + public class TestFragmentation { - internal static IEnumerable GetSixMerIndividualFragmentTypeTestCases() => + public static IEnumerable GetSixMerIndividualFragmentTypeTestCases() => TestNucleicAcid.GetSixmerIndividualFragmentTypeTestCases(); [Test] diff --git a/mzLib/Test/Transcriptomics/TestNucleicAcid.cs b/mzLib/Test/Transcriptomics/TestNucleicAcid.cs index efbf05020..a0c5619c9 100644 --- a/mzLib/Test/Transcriptomics/TestNucleicAcid.cs +++ b/mzLib/Test/Transcriptomics/TestNucleicAcid.cs @@ -16,11 +16,11 @@ namespace Test.Transcriptomics /// [TestFixture] [ExcludeFromCodeCoverage] - internal class TestNucleicAcid + public class TestNucleicAcid { - internal record SixmerTestCase(string Sequence, ProductType Type, double[] NeutralMasses, string[] ChemicalFormulas); + public record SixmerTestCase(string Sequence, ProductType Type, double[] NeutralMasses, string[] ChemicalFormulas); - internal static IEnumerable GetSixmerIndividualFragmentTypeTestCases() + public static 
IEnumerable GetSixmerIndividualFragmentTypeTestCases() { Loaders.LoadElements(); diff --git a/mzLib/Test/Transcriptomics/TestNucleotide.cs b/mzLib/Test/Transcriptomics/TestNucleotide.cs index df250fd40..277ebc3d6 100644 --- a/mzLib/Test/Transcriptomics/TestNucleotide.cs +++ b/mzLib/Test/Transcriptomics/TestNucleotide.cs @@ -9,12 +9,12 @@ namespace Test.Transcriptomics { [ExcludeFromCodeCoverage] - internal class TestNucleotide + public class TestNucleotide { - internal record NucleotideTestCase(Nucleotide Nucleotide, string Name, char OneLetterCode, string Symbol, ChemicalFormula Formula, double Mass, + public record NucleotideTestCase(Nucleotide Nucleotide, string Name, char OneLetterCode, string Symbol, ChemicalFormula Formula, double Mass, ChemicalFormula nucleosideFormula); - internal static IEnumerable GetNucleotideTestCases() + public static IEnumerable GetNucleotideTestCases() { Loaders.LoadElements(); diff --git a/mzLib/Test/Transcriptomics/TestRnase.cs b/mzLib/Test/Transcriptomics/TestRnase.cs index e72c12e11..b122f32bd 100644 --- a/mzLib/Test/Transcriptomics/TestRnase.cs +++ b/mzLib/Test/Transcriptomics/TestRnase.cs @@ -7,7 +7,7 @@ namespace Test.Transcriptomics { [ExcludeFromCodeCoverage] - internal class TestRnase + public class TestRnase { public static string rnaseTsvpath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"Digestion\rnases.tsv"); From c9041b09684b53899d37e676689ca07daffa285a Mon Sep 17 00:00:00 2001 From: Nic Bollis Date: Tue, 24 Sep 2024 14:30:48 -0500 Subject: [PATCH 06/17] Responded to Alex Comments --- .../Oligo/DissociationTypeCollection.cs | 262 +++++++++++++++++- .../Digestion/NucleolyticOligo.cs | 16 +- .../Digestion/OligoWithSetMods.cs | 54 ++-- mzLib/Transcriptomics/NucleicAcid.cs | 43 +-- 4 files changed, 313 insertions(+), 62 deletions(-) diff --git a/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs b/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs index 3bc08d089..7b5a411ee 100644 --- 
a/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs +++ b/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs @@ -1,4 +1,258 @@ -using Chemistry; using MassSpectrometry; namespace Omics.Fragmentation.Oligo { /// /// Methods dealing with specific product type for RNA molecules /// public static class DissociationTypeCollection { - /// /// Product Ion types by dissociation method /// /// /// HCD ions were taken from the following paper: https://www.nature.com/articles/s41598-023-36193-2 /// Ion types below here should be validated with experimental results. /// Base and water losses occur very frequently and may also be present in these activation types. /// CID, UVPD, and aEPD ions were taken from the following paper: https://pubs.acs.org/doi/10.1021/acs.analchem.3c05428?ref=PDF /// NETD ions were taken from the following paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7161943/ /// lowCID ions were taken from this Thermo Poster: https://assets.thermofisher.com/TFS-Assets/CMD/Flyers/fl-489263-asms23-optimized-fragmentation-oligonucleotides-suppresses-undesired-fragmentation-fl489263-en.pdf /// public static Dictionary> ProductsFromDissociationType = new Dictionary>() { { DissociationType.Unknown, new List() }, { DissociationType.Custom, new List() }, { DissociationType.AnyActivationType, new List { ProductType.a, ProductType.aBaseLoss, ProductType.aWaterLoss, ProductType.b, ProductType.bBaseLoss, ProductType.bWaterLoss, ProductType.c, ProductType.cBaseLoss, ProductType.cWaterLoss, ProductType.d, ProductType.dBaseLoss, ProductType.dWaterLoss, ProductType.w, ProductType.wBaseLoss, ProductType.wWaterLoss, ProductType.x, ProductType.xBaseLoss, ProductType.xWaterLoss, ProductType.y, ProductType.yBaseLoss, ProductType.yWaterLoss, ProductType.z, ProductType.zBaseLoss, ProductType.zWaterLoss, ProductType.M } }, { DissociationType.CID, new List { ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, 
ProductType.y, ProductType.yWaterLoss, ProductType.M } }, { DissociationType.HCD, new List { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.z, ProductType.M } }, { DissociationType.UVPD, new List { ProductType.a, ProductType.c, ProductType.d, ProductType.w, ProductType.M } }, { DissociationType.aEPD, new List { ProductType.a, ProductType.c, ProductType.d, ProductType.w, ProductType.x, ProductType.z, ProductType.M } }, { DissociationType.NETD, new List { ProductType.w, ProductType.d, ProductType.M } }, { DissociationType.LowCID, new List() { ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M } }, { DissociationType.IRMPD, new List() { } }, { DissociationType.ECD, new List { } }, { DissociationType.PQD, new List { } }, { DissociationType.ETD, new List { } }, { DissociationType.EThcD, new List { } }, }; - - /// /// Returns all dissociation types with implemented product type collections /// public static IEnumerable AllImplementedDissociationTypes => ProductsFromDissociationType.Where(p => p.Value.Any()) .Select(p => p.Key); /// /// Returns list of products types based upon the dissociation type /// /// /// public static List GetRnaProductTypesFromDissociationType(this DissociationType dissociationType) => ProductsFromDissociationType[dissociationType]; /// /// Mass to be added or subtracted /// private static readonly Dictionary FragmentIonCaps = new Dictionary { { ProductType.a, ChemicalFormula.ParseFormula("H") }, { ProductType.aWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, { ProductType.b, ChemicalFormula.ParseFormula("OH") }, { ProductType.bWaterLoss, ChemicalFormula.ParseFormula("H-1") }, { ProductType.c, ChemicalFormula.ParseFormula("O3H2P") }, { ProductType.cWaterLoss, ChemicalFormula.ParseFormula("O2P") }, { ProductType.d, ChemicalFormula.ParseFormula("O4H2P") 
}, { ProductType.dWaterLoss, ChemicalFormula.ParseFormula("O3P") }, { ProductType.w, ChemicalFormula.ParseFormula("H") }, { ProductType.wWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, { ProductType.x, ChemicalFormula.ParseFormula("O-1H") }, { ProductType.xWaterLoss, ChemicalFormula.ParseFormula("O-2H-1") }, { ProductType.y, ChemicalFormula.ParseFormula("O-3P-1") }, { ProductType.yWaterLoss, ChemicalFormula.ParseFormula("O-4H-2P-1") }, { ProductType.z, ChemicalFormula.ParseFormula("O-4P-1") }, { ProductType.zWaterLoss, ChemicalFormula.ParseFormula("O-5H-2P-1") }, //fragment - Base chemical formula is the corresponding fragment chemical formula subtracing 1 H as H is lost when base is removed { ProductType.aBaseLoss, ChemicalFormula.ParseFormula("H-2") }, // "H-1" -H { ProductType.bBaseLoss, ChemicalFormula.ParseFormula("O1H-2") }, //"OH1" -H { ProductType.cBaseLoss, ChemicalFormula.ParseFormula("O3H-1P") }, //"O3P" -H { ProductType.dBaseLoss, ChemicalFormula.ParseFormula("O4H-1P") }, //"O4H2P" -H { ProductType.wBaseLoss, ChemicalFormula.ParseFormula("H-2") }, //"H"-H { ProductType.xBaseLoss, ChemicalFormula.ParseFormula("O-1H-2") }, //"O-1H" -H { ProductType.yBaseLoss, ChemicalFormula.ParseFormula("O-3H-2P-1") }, //"O-3P-1" -H { ProductType.zBaseLoss, ChemicalFormula.ParseFormula("O-4H-3P-1") }, //"O-4H-1P-1" -1 { ProductType.M, new ChemicalFormula() } }; /// /// Returns mass shift by product type /// /// /// public static double GetRnaMassShiftFromProductType(this ProductType type) => FragmentIonCaps[type].MonoisotopicMass; public static FragmentationTerminus GetRnaTerminusType(this ProductType fragmentType) { switch (fragmentType) { case ProductType.a: case ProductType.aWaterLoss: case ProductType.aBaseLoss: case ProductType.b: case ProductType.bWaterLoss: case ProductType.bBaseLoss: case ProductType.c: case ProductType.cWaterLoss: case ProductType.cBaseLoss: case ProductType.d: case ProductType.dWaterLoss: case ProductType.dBaseLoss: return 
FragmentationTerminus.FivePrime; case ProductType.w: case ProductType.wWaterLoss: case ProductType.wBaseLoss: case ProductType.x: case ProductType.xWaterLoss: case ProductType.xBaseLoss: case ProductType.y: case ProductType.yWaterLoss: case ProductType.yBaseLoss: case ProductType.z: case ProductType.zWaterLoss: case ProductType.zBaseLoss: return FragmentationTerminus.ThreePrime; case ProductType.M: return FragmentationTerminus.None; case ProductType.aStar: case ProductType.aDegree: case ProductType.bAmmoniaLoss: case ProductType.yAmmoniaLoss: case ProductType.zPlusOne: case ProductType.D: case ProductType.Ycore: case ProductType.Y: default: throw new ArgumentOutOfRangeException(nameof(fragmentType), fragmentType, null); } } /// /// Product ion types by Fragmentation Terminus /// private static readonly Dictionary> ProductIonTypesFromSpecifiedTerminus = new Dictionary> { { FragmentationTerminus.FivePrime, new List { ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, } }, { FragmentationTerminus.ThreePrime, new List { ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, } }, { FragmentationTerminus.Both, new List { ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, ProductType.z, ProductType.zWaterLoss, 
ProductType.zBaseLoss, ProductType.M } } }; public static List GetRnaTerminusSpecificProductTypes( this FragmentationTerminus fragmentationTerminus) { return ProductIonTypesFromSpecifiedTerminus[fragmentationTerminus]; } /// /// Returns all product ion types based upon specified terminus /// /// /// /// public static List GetRnaTerminusSpecificProductTypesFromDissociation( this DissociationType dissociationType, FragmentationTerminus fragmentationTerminus) { var terminusSpecific = fragmentationTerminus.GetRnaTerminusSpecificProductTypes(); var dissociationSpecific = dissociationType.GetRnaProductTypesFromDissociationType(); return terminusSpecific.Intersect(dissociationSpecific).ToList(); } } } \ No newline at end of file +using Chemistry; +using MassSpectrometry; + +namespace Omics.Fragmentation.Oligo +{ + /// + /// Methods dealing with specific product type for RNA molecules + /// + public static class DissociationTypeCollection + { + /// + /// Product Ion types by dissociation method + /// + /// + /// HCD ions were taken from the following paper: https://www.nature.com/articles/s41598-023-36193-2 + /// Ion types below here should be validated with experimental results. + /// Base and water losses occur very frequently and may also be present in these activation types. 
+ /// CID, UVPD, and aEPD ions were taken from the following paper: https://pubs.acs.org/doi/10.1021/acs.analchem.3c05428?ref=PDF + /// NETD ions were taken from the following paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7161943/ + /// lowCID ions were taken from this Thermo Poster: https://assets.thermofisher.com/TFS-Assets/CMD/Flyers/fl-489263-asms23-optimized-fragmentation-oligonucleotides-suppresses-undesired-fragmentation-fl489263-en.pdf + /// + public static Dictionary> ProductsFromDissociationType = + new Dictionary>() + { + { DissociationType.Unknown, new List() }, + { DissociationType.Custom, new List() }, + { + DissociationType.AnyActivationType, new List + { + ProductType.a, ProductType.aBaseLoss, ProductType.aWaterLoss, + ProductType.b, ProductType.bBaseLoss, ProductType.bWaterLoss, + ProductType.c, ProductType.cBaseLoss, ProductType.cWaterLoss, + ProductType.d, ProductType.dBaseLoss, ProductType.dWaterLoss, + ProductType.w, ProductType.wBaseLoss, ProductType.wWaterLoss, + ProductType.x, ProductType.xBaseLoss, ProductType.xWaterLoss, + ProductType.y, ProductType.yBaseLoss, ProductType.yWaterLoss, + ProductType.z, ProductType.zBaseLoss, ProductType.zWaterLoss, + ProductType.M + } + }, + { + DissociationType.CID, new List + { + ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, + ProductType.y, ProductType.yWaterLoss, ProductType.M + } + }, + { + DissociationType.HCD, new List + { + ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, + ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.z, + ProductType.M + } + }, + { + DissociationType.UVPD, new List + { + ProductType.a, ProductType.c, ProductType.d, ProductType.w, ProductType.M + } + }, + { + DissociationType.aEPD, new List + { + ProductType.a, ProductType.c, ProductType.d, ProductType.w, ProductType.x, ProductType.z, ProductType.M + } + }, + { + DissociationType.NETD, new List + { + 
ProductType.w, ProductType.d, ProductType.M + } + }, + { + DissociationType.LowCID, new List() + { + ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, + ProductType.y, ProductType.yWaterLoss, ProductType.M + } + }, + { DissociationType.IRMPD, new List() { } }, + { DissociationType.ECD, new List { } }, + { DissociationType.PQD, new List { } }, + { DissociationType.ETD, new List { } }, + { DissociationType.EThcD, new List { } }, + }; + + /// + /// Returns all dissociation types with implemented product type collections + /// + public static IEnumerable AllImplementedDissociationTypes => + ProductsFromDissociationType.Where(p => p.Value.Any()) + .Select(p => p.Key); + + /// + /// Returns the list of product types based upon the dissociation type + /// + /// + /// + public static List GetRnaProductTypesFromDissociationType(this DissociationType dissociationType) => + ProductsFromDissociationType[dissociationType]; + + + /// + /// Mass to be added or subtracted + /// + private static readonly Dictionary FragmentIonCaps = + new Dictionary + { + { ProductType.a, ChemicalFormula.ParseFormula("H") }, + { ProductType.aWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, + { ProductType.b, ChemicalFormula.ParseFormula("OH") }, + { ProductType.bWaterLoss, ChemicalFormula.ParseFormula("H-1") }, + { ProductType.c, ChemicalFormula.ParseFormula("O3H2P") }, + { ProductType.cWaterLoss, ChemicalFormula.ParseFormula("O2P") }, + { ProductType.d, ChemicalFormula.ParseFormula("O4H2P") }, + { ProductType.dWaterLoss, ChemicalFormula.ParseFormula("O3P") }, + + { ProductType.w, ChemicalFormula.ParseFormula("H") }, + { ProductType.wWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, + { ProductType.x, ChemicalFormula.ParseFormula("O-1H") }, + { ProductType.xWaterLoss, ChemicalFormula.ParseFormula("O-2H-1") }, + { ProductType.y, ChemicalFormula.ParseFormula("O-3P-1") }, + { ProductType.yWaterLoss, ChemicalFormula.ParseFormula("O-4H-2P-1") }, + { ProductType.z, 
ChemicalFormula.ParseFormula("O-4P-1") }, + { ProductType.zWaterLoss, ChemicalFormula.ParseFormula("O-5H-2P-1") }, + //fragment - Base chemical formula is the corresponding fragment chemical formula subtracting 1 H as H is lost when base is removed + { ProductType.aBaseLoss, ChemicalFormula.ParseFormula("H-2") }, // "H-1" -H + { ProductType.bBaseLoss, ChemicalFormula.ParseFormula("O1H-2") }, //"OH1" -H + { ProductType.cBaseLoss, ChemicalFormula.ParseFormula("O3H-1P") }, //"O3P" -H + { ProductType.dBaseLoss, ChemicalFormula.ParseFormula("O4H-1P") }, //"O4H2P" -H + + { ProductType.wBaseLoss, ChemicalFormula.ParseFormula("H-2") }, //"H"-H + { ProductType.xBaseLoss, ChemicalFormula.ParseFormula("O-1H-2") }, //"O-1H" -H + { ProductType.yBaseLoss, ChemicalFormula.ParseFormula("O-3H-2P-1") }, //"O-3P-1" -H + { ProductType.zBaseLoss, ChemicalFormula.ParseFormula("O-4H-3P-1") }, //"O-4H-1P-1" -1 + + { ProductType.M, new ChemicalFormula() } + }; + + /// + /// Returns mass shift by product type + /// + /// + /// + public static double GetRnaMassShiftFromProductType(this ProductType type) => FragmentIonCaps[type].MonoisotopicMass; + + public static FragmentationTerminus GetRnaTerminusType(this ProductType fragmentType) + { + switch (fragmentType) + { + case ProductType.a: + case ProductType.aWaterLoss: + case ProductType.aBaseLoss: + case ProductType.b: + case ProductType.bWaterLoss: + case ProductType.bBaseLoss: + case ProductType.c: + case ProductType.cWaterLoss: + case ProductType.cBaseLoss: + case ProductType.d: + case ProductType.dWaterLoss: + case ProductType.dBaseLoss: + return FragmentationTerminus.FivePrime; + + case ProductType.w: + case ProductType.wWaterLoss: + case ProductType.wBaseLoss: + case ProductType.x: + case ProductType.xWaterLoss: + case ProductType.xBaseLoss: + case ProductType.y: + case ProductType.yWaterLoss: + case ProductType.yBaseLoss: + case ProductType.z: + case ProductType.zWaterLoss: + case ProductType.zBaseLoss: + return 
FragmentationTerminus.ThreePrime; + + case ProductType.M: + return FragmentationTerminus.None; + + case ProductType.aStar: + case ProductType.aDegree: + case ProductType.bAmmoniaLoss: + case ProductType.yAmmoniaLoss: + case ProductType.zPlusOne: + case ProductType.D: + case ProductType.Ycore: + case ProductType.Y: + default: + throw new ArgumentOutOfRangeException(nameof(fragmentType), fragmentType, null); + } + } + + /// + /// Product ion types by Fragmentation Terminus + /// + private static readonly Dictionary> + ProductIonTypesFromSpecifiedTerminus = new Dictionary> + { + { + FragmentationTerminus.FivePrime, new List + { + ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, + ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, + ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, + ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, + } + }, + { + FragmentationTerminus.ThreePrime, new List + { + ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, + ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, + ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, + ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, + } + }, + { + FragmentationTerminus.Both, new List + { + + ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, + ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, + ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, + ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, + ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, + ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, + ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, + ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, + ProductType.M + } + } + }; + + + public static List GetRnaTerminusSpecificProductTypes( + this FragmentationTerminus fragmentationTerminus) + { + return ProductIonTypesFromSpecifiedTerminus[fragmentationTerminus]; + } + 
+ /// + /// Returns all product ion types based upon specified terminus + /// + /// + /// + /// + public static List GetRnaTerminusSpecificProductTypesFromDissociation( + this DissociationType dissociationType, FragmentationTerminus fragmentationTerminus) + { + var terminusSpecific = fragmentationTerminus.GetRnaTerminusSpecificProductTypes(); + var dissociationSpecific = dissociationType.GetRnaProductTypesFromDissociationType(); + return terminusSpecific.Intersect(dissociationSpecific).ToList(); + } + } +} diff --git a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs index 3a7382e7c..7f98597a4 100644 --- a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs +++ b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs @@ -32,7 +32,18 @@ public override string ToString() return BaseSequence; } - internal IEnumerable GetModifiedOligos(IEnumerable allKnownFixedMods, + /// + /// Generates a collection of oligos with set modifications based on the provided fixed and variable modifications, + /// digestion parameters, and the nucleic acid sequence. + /// + /// A collection of all known fixed modifications. + /// Parameters for RNA digestion. + /// A list of variable modifications to consider. + /// An enumerable collection of oligos with set modifications. 
+ /// + /// Code heavily borrowed from ProteolyticPeptide.GetModifiedPeptides + /// + internal IEnumerable GenerateModifiedOligos(IEnumerable allKnownFixedMods, RnaDigestionParams digestionParams, List variableModifications) { int oligoLength = OneBasedEndResidue - OneBasedStartResidue + 1; @@ -79,7 +90,7 @@ internal IEnumerable GetModifiedOligos(IEnumerable= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; @@ -127,6 +138,7 @@ internal IEnumerable GetModifiedOligos(IEnumerable kvp in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForOligo, oligoLength)) { int numFixedMods = 0; diff --git a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs index 966f97c50..f5e51c19c 100644 --- a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs +++ b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs @@ -9,6 +9,17 @@ namespace Transcriptomics.Digestion { + + /// + /// Represents an oligonucleotide with set modifications, providing properties and methods for + /// accessing and manipulating its chemical characteristics. + /// + /// + /// The monoisotopic mass, most abundant mass, and chemical formula are calculated on the fly if the corresponding properties + /// (_monoisotopicMass, _thisChemicalFormula, _mostAbundantMonoisotopicMass) are null. This ensures that the most up-to-date values are + /// always available based on the current state of the oligonucleotide and its modifications. Therefore, it is important to set those + /// properties to null whenever a terminus or modification is changed. 
+ /// public class OligoWithSetMods : NucleolyticOligo, IBioPolymerWithSetMods, INucleicAcid { public OligoWithSetMods(NucleicAcid nucleicAcid, RnaDigestionParams digestionParams, int oneBaseStartResidue, @@ -83,13 +94,10 @@ public double MonoisotopicMass { get { - if (_monoisotopicMass is null) - { - _monoisotopicMass = BaseSequence.Sum(nuc => Nucleotide.GetResidue(nuc).MonoisotopicMass) + - AllModsOneIsNterminus.Values.Sum(mod => mod.MonoisotopicMass.Value) + - FivePrimeTerminus.MonoisotopicMass + - ThreePrimeTerminus.MonoisotopicMass; - } + _monoisotopicMass ??= BaseSequence.Sum(nuc => Nucleotide.GetResidue(nuc).MonoisotopicMass) + + AllModsOneIsNterminus.Values.Sum(mod => mod.MonoisotopicMass!.Value) + + FivePrimeTerminus.MonoisotopicMass + + ThreePrimeTerminus.MonoisotopicMass; return _monoisotopicMass.Value; } } @@ -98,20 +106,19 @@ public ChemicalFormula ThisChemicalFormula { get { - if (_thisChemicalFormula is null) + if (_thisChemicalFormula is not null) return _thisChemicalFormula!; + + var fullFormula = new RNA(BaseSequence, FivePrimeTerminus, ThreePrimeTerminus).GetChemicalFormula(); + foreach (var mod in AllModsOneIsNterminus.Values) { - var fullFormula = new RNA(BaseSequence, FivePrimeTerminus, ThreePrimeTerminus).GetChemicalFormula(); - foreach (var mod in AllModsOneIsNterminus.Values) + if (mod.ChemicalFormula is null) { - if (mod.ChemicalFormula is null) - { - fullFormula = null; - break; - } - fullFormula.Add(mod.ChemicalFormula); + fullFormula = null; + break; } - _thisChemicalFormula = fullFormula; + fullFormula.Add(mod.ChemicalFormula); } + _thisChemicalFormula = fullFormula; return _thisChemicalFormula!; } } @@ -120,13 +127,12 @@ public double MostAbundantMonoisotopicMass { get { - if (_mostAbundantMonoisotopicMass is null) - { - var distribution = IsotopicDistribution.GetDistribution(ThisChemicalFormula); - double maxIntensity = distribution.Intensities.Max(); - _mostAbundantMonoisotopicMass = 
distribution.Masses[distribution.Intensities.IndexOf(maxIntensity)].RoundedDouble(); - } - return _mostAbundantMonoisotopicMass.Value; + if (_mostAbundantMonoisotopicMass is not null) return _mostAbundantMonoisotopicMass.Value; + + var distribution = IsotopicDistribution.GetDistribution(ThisChemicalFormula); + double maxIntensity = distribution.Intensities.Max(); + _mostAbundantMonoisotopicMass = distribution.Masses[distribution.Intensities.IndexOf(maxIntensity)].RoundedDouble(); + return _mostAbundantMonoisotopicMass!.Value; } } diff --git a/mzLib/Transcriptomics/NucleicAcid.cs b/mzLib/Transcriptomics/NucleicAcid.cs index 5a42b7d67..db6f18f43 100644 --- a/mzLib/Transcriptomics/NucleicAcid.cs +++ b/mzLib/Transcriptomics/NucleicAcid.cs @@ -45,10 +45,6 @@ public abstract class NucleicAcid : INucleicAcid, IBioPolymer, IEquatable /// For creating an RNA programatically /// - /// - /// - /// - /// protected NucleicAcid(string sequence, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, IDictionary>? oneBasedPossibleLocalizedModifications = null) { @@ -59,23 +55,12 @@ protected NucleicAcid(string sequence, IHasChemicalFormula? fivePrimeTerm = null _oneBasedPossibleLocalizedModifications = oneBasedPossibleLocalizedModifications ?? new Dictionary>(); GeneNames = new List>(); - ParseSequence(sequence); + ParseSequenceString(sequence); } /// /// For Reading in from rna database /// - /// - /// - /// - /// - /// - /// - /// - /// - /// - /// - /// protected NucleicAcid(string sequence, string name, string identifier, string organism, string databaseFilePath, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, IDictionary>? 
oneBasedPossibleLocalizedModifications = null, @@ -120,7 +105,6 @@ protected NucleicAcid(string sequence, string name, string identifier, string or #endregion - #region Public Properties /// @@ -216,7 +200,7 @@ public IEnumerable Digest(IDigestionParams digestionPara digestionParams.MaxMissedCleavages, digestionParams.MinLength, digestionParams.MaxLength)) { // add fixed and variable mods to base sequence digestion products - foreach (var modifiedOligo in unmodifiedOligo.GetModifiedOligos(allKnownFixedMods, digestionParams, + foreach (var modifiedOligo in unmodifiedOligo.GenerateModifiedOligos(allKnownFixedMods, digestionParams, variableModifications)) { yield return modifiedOligo; @@ -273,10 +257,10 @@ public ChemicalFormula GetChemicalFormula() #region Private Methods - private bool ReplaceTerminus(ref IHasChemicalFormula terminus, IHasChemicalFormula value) + private void ReplaceTerminus(ref IHasChemicalFormula? terminus, IHasChemicalFormula? value) { if (Equals(value, terminus)) - return false; + return; if (terminus != null) MonoisotopicMass -= terminus.MonoisotopicMass; @@ -285,24 +269,20 @@ private bool ReplaceTerminus(ref IHasChemicalFormula terminus, IHasChemicalFormu if (value != null) MonoisotopicMass += value.MonoisotopicMass; - - return true; } /// - /// Parses a string sequence of nucleic acids characters into a peptide object + /// Parses a string sequence of nucleic acid characters into an array of Nucleotide objects, + /// updates the sequence string, and calculates the monoisotopic mass. /// - /// - /// - private bool ParseSequence(string sequence) + /// The string sequence of nucleic acid characters to parse. 
+ private void ParseSequenceString(string sequence) { if (string.IsNullOrEmpty(sequence)) - return false; + return; int index = 0; - double monoMass = 0; - ChemicalFormula chemFormula = new(); StringBuilder sb = null; sb = new StringBuilder(sequence.Length); @@ -337,8 +317,6 @@ private bool ParseSequence(string sequence) _sequence = sb.ToString(); MonoisotopicMass += monoMass; Array.Resize(ref _nucleicAcids, Length); - - return true; } #endregion @@ -349,7 +327,8 @@ public bool Equals(NucleicAcid? other) { if (ReferenceEquals(null, other)) return false; if (ReferenceEquals(this, other)) return true; - return _5PrimeTerminus.Equals(other._5PrimeTerminus) + return _sequence == other._sequence + && _5PrimeTerminus.Equals(other._5PrimeTerminus) && _3PrimeTerminus.Equals(other._3PrimeTerminus); } From 94d8bfad98ac50447fe231dbdbe11b12ca8d44fe Mon Sep 17 00:00:00 2001 From: Nic Bollis Date: Tue, 24 Sep 2024 14:52:08 -0500 Subject: [PATCH 07/17] Add RNA support: loading, parsing, and decoy generation Introduced support for handling RNA data within the UsefulProteomicsDatabases project. Key changes include: - Added `Transcriptomics\TestData` folder to `Test.csproj`. - Changed access modifiers in `ProteinDbLoader.cs` to internal. - Added `using` directives for `Transcriptomics` in `ProteinXmlEntry.cs`. - Introduced methods `ParseRnaEndElement` and `ParseRnaEntryEndElement` in `ProteinXmlEntry.cs`. - Modified `ParseAnnotatedMods` to check for RNA modifications. - Added project reference to `Transcriptomics.csproj` in `UsefulProteomicsDatabases.csproj`. - Added `ClassExtensions.cs` with `CreateNew` method for nucleic acids. - Added `RnaDbLoader.cs` for RNA database loading. - Added `RnaDecoyGenerator.cs` for generating decoy RNA sequences. 
--- mzLib/Test/Test.csproj | 1 + mzLib/Transcriptomics/ClassExtensions.cs | 81 ++++++ .../ProteinDbLoader.cs | 6 +- .../ProteinXmlEntry.cs | 58 ++++- .../Transcriptomics/RnaDbLoader.cs | 245 ++++++++++++++++++ .../Transcriptomics/RnaDecoyGenerator.cs | 88 +++++++ .../UsefulProteomicsDatabases.csproj | 1 + 7 files changed, 475 insertions(+), 5 deletions(-) create mode 100644 mzLib/Transcriptomics/ClassExtensions.cs create mode 100644 mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs create mode 100644 mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDecoyGenerator.cs diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index b58d87522..a4cf4545d 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -513,6 +513,7 @@ + diff --git a/mzLib/Transcriptomics/ClassExtensions.cs b/mzLib/Transcriptomics/ClassExtensions.cs new file mode 100644 index 000000000..a61c5c837 --- /dev/null +++ b/mzLib/Transcriptomics/ClassExtensions.cs @@ -0,0 +1,81 @@ +using Omics.Modifications; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Transcriptomics.Digestion; + +namespace Transcriptomics +{ + public static class ClassExtensions + { + /// + /// Creates a new instance of a nucleic acid or oligo with set modifications, optionally updating its sequence, modifications, and decoy status. + /// + /// The type of the nucleic acid, which must implement . + /// The target nucleic acid or oligo with set modifications to base the new instance on. + /// The new sequence string, if any. If null, the original sequence is used. + /// A dictionary of modifications to apply, if any. If null, the original modifications are used. + /// A flag indicating whether the sequence is a decoy, if any. If null, the original decoy status is used. + /// A new instance of the specified nucleic acid type with the provided or existing properties. 
+ /// + /// This method facilitates the generation of new sequences for both nucleic acids and oligos with set modifications by allowing + /// optional updates to the sequence string, modifications, and decoy status. It ensures that the new instances are properly + /// initialized with the provided or existing properties, enabling further analysis of modified sequences and future generation of decoys on the fly. + /// + public static T CreateNew(this T target, string? sequence = null, IDictionary>? modifications = null, + bool? isDecoy = null) + where T : INucleicAcid + { + // set new object parameters where not null + object? returnObj = null; + string newSequence = sequence ?? target.BaseSequence; + IDictionary> newModifications = modifications ?? target.OneBasedPossibleLocalizedModifications; + + switch (target) + { + case RNA rna: + { + bool newIsDecoy = isDecoy ?? rna.IsDecoy; + returnObj = new RNA(newSequence, rna.Name, rna.Accession, rna.Organism, rna.DatabaseFilePath, + rna.FivePrimeTerminus, rna.ThreePrimeTerminus, newModifications, rna.IsContaminant, newIsDecoy, rna.AdditionalDatabaseFields); + break; + } + case OligoWithSetMods oligo: + { + var oldParent = oligo.Parent as RNA ?? 
throw new NullReferenceException(); + var newParent = new RNA( + newSequence, + oldParent.Name, + oldParent.Accession, + oldParent.Organism, + oldParent.DatabaseFilePath, + oldParent.FivePrimeTerminus, + oldParent.ThreePrimeTerminus, + newModifications, + oldParent.IsContaminant, + oldParent.IsDecoy, + oldParent.AdditionalDatabaseFields); + + returnObj = new OligoWithSetMods( + newParent, + (oligo.DigestionParams as RnaDigestionParams)!, + oligo.OneBasedStartResidue, + oligo.OneBasedEndResidue, + oligo.MissedCleavages, + oligo.CleavageSpecificityForFdrCategory, + newModifications.ToDictionary(p => p.Key, p => p.Value.First()), + oligo.NumFixedMods, + oligo.FivePrimeTerminus, + oligo.ThreePrimeTerminus); + break; + } + default: + throw new ArgumentException("INucleicAcid type not yet implemented"); + } + + return (T)returnObj ?? throw new NullReferenceException("Error creating new INucleicAcid"); + } + } +} diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs index 8544c2233..b5a680a5e 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs @@ -402,7 +402,7 @@ public static IEnumerable MergeProteins(IEnumerable mergeThese } } - private static string ApplyRegex(FastaHeaderFieldRegex regex, string line) + internal static string ApplyRegex(FastaHeaderFieldRegex regex, string line) { string result = null; if (regex != null) @@ -416,7 +416,7 @@ private static string ApplyRegex(FastaHeaderFieldRegex regex, string line) return result; } - private static Dictionary> GetModificationDict(IEnumerable mods) + internal static Dictionary> GetModificationDict(IEnumerable mods) { var mod_dict = new Dictionary>(); @@ -436,7 +436,7 @@ private static Dictionary> GetModificationDict(IEnum return mod_dict; } - private static Dictionary GetModificationDictWithMotifs(IEnumerable mods) + internal static Dictionary GetModificationDictWithMotifs(IEnumerable mods) { 
var mod_dict = new Dictionary(); diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs index a93c896e7..698a1c51c 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs @@ -5,6 +5,8 @@ using System.Text.RegularExpressions; using System.Xml; using Omics.Modifications; +using Transcriptomics; +using UsefulProteomicsDatabases.Transcriptomics; namespace UsefulProteomicsDatabases { @@ -182,6 +184,38 @@ public Protein ParseEndElement(XmlReader xml, IEnumerable modTypesToExcl return protein; } + internal RNA ParseRnaEndElement(XmlReader xml, IEnumerable modTypesToExclude, + Dictionary unknownModifications, + bool isContaminant, string rnaDbLocation) + { + RNA result = null; + if (xml.Name == "feature") + { + ParseFeatureEndElement(xml, modTypesToExclude, unknownModifications); + } + if (xml.Name == "subfeature") + { + ParseSubFeatureEndElement(xml, modTypesToExclude, unknownModifications); + } + else if (xml.Name == "dbReference") + { + ParseDatabaseReferenceEndElement(xml); + } + else if (xml.Name == "gene") + { + ReadingGene = false; + } + else if (xml.Name == "organism") + { + ReadingOrganism = false; + } + else if (xml.Name == "entry") + { + result = ParseRnaEntryEndElement(xml, isContaminant, rnaDbLocation, modTypesToExclude, unknownModifications); + } + return result; + } + /// /// Finish parsing an entry /// @@ -202,6 +236,24 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr return result; } + internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string rnaDbLocation, + IEnumerable modTypesToExclude, Dictionary unknownModifications) + { + RNA result = null; + if (Accession != null && Sequence != null) + { + // sanitize the sequence to replace unexpected characters with X (unknown amino acid) + // sometimes strange characters get added by RNA sequencing software, etc. 
+ Sequence = ProteinDbLoader.SanitizeAminoAcidSequence(Sequence, 'X'); + + ParseAnnotatedMods(OneBasedModifications, modTypesToExclude, unknownModifications, AnnotatedMods); + result = new RNA(Sequence, Name, Accession, Organism, rnaDbLocation, null, + null, OneBasedModifications, isContaminant, false, null); + } + Clear(); + return result; + } + /// /// Finish parsing a subfeature element /// @@ -304,7 +356,8 @@ private static void ParseAnnotatedMods(Dictionary> desti string annotatedId = annotatedMod.Item2; int annotatedModLocation = annotatedMod.Item1; - if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod)) + if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod) + || RnaDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out foundMod)) { // if the list of known mods contains this IdWithMotif if (!modTypesToExclude.Contains(foundMod.ModificationType)) @@ -322,7 +375,8 @@ private static void ParseAnnotatedMods(Dictionary> desti } // no known mod - try looking it up in the dictionary of mods without motif appended - else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList mods)) + else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList mods) + || RnaDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out mods)) { foreach (Modification mod in mods) { diff --git a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs new file mode 100644 index 000000000..1828f7b4c --- /dev/null +++ b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs @@ -0,0 +1,245 @@ +using Omics.Modifications; +using System; +using System.Collections.Generic; +using System.IO.Compression; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading.Tasks; +using System.Xml; +using Chemistry; +using Transcriptomics; + +namespace 
UsefulProteomicsDatabases.Transcriptomics +{ + public enum RnaFastaHeaderType + { + Modomics, + Unknown, + } + + public static class RnaDbLoader + { + + public static readonly Dictionary ModomicsFieldRegexes = + new Dictionary() + { + { "Id", new FastaHeaderFieldRegex("Id", @"id:(?.+?)\|", 0, 1) }, + { "Name", new FastaHeaderFieldRegex("Name", @"Name:(?.+?)\|", 0, 1) }, + { "SOterm", new FastaHeaderFieldRegex("SOterm", @"SOterm:(?.+?)\|", 0, 1) }, + { "Type", new FastaHeaderFieldRegex("Type", @"Type:(?.+?)\|", 0, 1) }, + { "Subtype", new FastaHeaderFieldRegex("Subtype", @"Subtype:(?.+?)\|", 0, 1) }, + { "Feature", new FastaHeaderFieldRegex("Feature", @"Feature:(?.+?)\|", 0, 1) }, + { "Organism", new FastaHeaderFieldRegex("Organism", @"Species:(?.+?)$", 0, 1) }, + { "Cellular Localization", new FastaHeaderFieldRegex("CellularLocalization", @"Cellular_Localization:(?.+?)\|", 0, 1) }, + }; + + + public static List LoadRnaFasta(string rnaDbLocation, bool generateTargets, DecoyType decoyType, + bool isContaminant, out List errors, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null) + { + RnaFastaHeaderType? 
headerType = null; + Regex substituteWhitespace = new Regex(@"\s+"); + errors = new List(); + List targets = new List(); + string identifierHeader = null; + + string name = null; + string organism = null; + string identifier = null; + + string newDbLocation = rnaDbLocation; + + //we had trouble decompressing and streaming on the fly so we decompress completely first, then stream the file, then delete the decompressed file + if (rnaDbLocation.EndsWith(".gz")) + { + newDbLocation = Path.Combine(Path.GetDirectoryName(rnaDbLocation), "temp.fasta"); + using var stream = new FileStream(rnaDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read); + using FileStream outputFileStream = File.Create(newDbLocation); + using var decompressor = new GZipStream(stream, CompressionMode.Decompress); + decompressor.CopyTo(outputFileStream); + } + + using (var fastaFileStream = new FileStream(newDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read)) + { + StringBuilder sb = null; + StreamReader fasta = new StreamReader(fastaFileStream); + Dictionary regexResults = new(); + Dictionary regexes = null; + + while (true) + { + string line = ""; + line = fasta.ReadLine(); + if (line == null) { break; } + + if (line.StartsWith(">")) + { + if (headerType is null) + { + headerType = DetectFastaHeaderType(line); + + switch (headerType) + { + case RnaFastaHeaderType.Modomics: + regexes = ModomicsFieldRegexes; + identifierHeader = "SOterm"; + break; + + case RnaFastaHeaderType.Unknown: + case null: + default: + throw new MzLibUtil.MzLibException("Unknown fasta header format: " + line); + } + } + + + regexResults = ParseRegexFields(line, regexes); + name = regexResults["Name"]; + regexResults.Remove("Name"); + organism = regexResults["Organism"]; + regexResults.Remove("Organism"); + identifier = regexResults[identifierHeader]; + regexResults.Remove(identifierHeader); + + sb = new StringBuilder(); + } + else if (sb is not null) + { + sb.Append(line.Trim()); + } + + if ((fasta.Peek() == '>' 
/// <summary>
/// Classifies a FASTA header line: a line that does not begin with '&gt;' is
/// <see cref="RnaFastaHeaderType.Unknown"/>; every other line is reported as
/// <see cref="RnaFastaHeaderType.Modomics"/>.
/// </summary>
/// <param name="line">The candidate header line.</param>
private static RnaFastaHeaderType DetectFastaHeaderType(string line)
{
    if (!line.StartsWith(">"))
        return RnaFastaHeaderType.Unknown;

    return RnaFastaHeaderType.Modomics;
}

/// <summary>
/// Applies every header regex to <paramref name="line"/> and collects the results
/// under the same keys as <paramref name="regexes"/>.
/// </summary>
/// <param name="line">The FASTA header line to parse.</param>
/// <param name="regexes">Field name mapped to the regex that extracts it.</param>
/// <returns>Field name mapped to whatever <c>ProteinDbLoader.ApplyRegex</c> returned for it.</returns>
private static Dictionary<string, string> ParseRegexFields(string line,
    Dictionary<string, FastaHeaderFieldRegex> regexes)
{
    Dictionary<string, string> fields = new Dictionary<string, string>();

    foreach (var regex in regexes)
    {
        string match = ProteinDbLoader.ApplyRegex(regex.Value, line);
        fields.Add(regex.Key, match);
    }

    return fields;
}


// Replaced by LoadRnaXML whenever the database or the caller supplies at least one modification.
public static Dictionary<string, IList<Modification>> IdToPossibleMods = new Dictionary<string, IList<Modification>>();
public static Dictionary<string, Modification> IdWithMotifToMod = new Dictionary<string, Modification>();

/// <summary>
/// Loads RNA entries from an mzLib-style XML database (optionally gzip-compressed)
/// and generates decoys for them.
/// </summary>
/// <param name="rnaDbLocation">Path to the XML database; a path ending in ".gz" is decompressed to a temporary file first.</param>
/// <param name="generateTargets">When true the returned list holds targets followed by decoys; when false only decoys.</param>
/// <param name="decoyType">Decoy strategy forwarded to <c>RnaDecoyGenerator.GenerateDecoys</c>.</param>
/// <param name="isContaminant">Forwarded to the XML entry parser for every entry.</param>
/// <param name="allKnownModifications">Modifications known to the caller; null is treated as empty.</param>
/// <param name="modTypesToExclude">Modification types forwarded to the XML entry parser; null is treated as empty.</param>
/// <param name="unknownModifications">Populated by the XML entry parser while reading.</param>
/// <param name="maxThreads">Degree of parallelism forwarded to decoy generation.</param>
/// <param name="fivePrimeTerm">Not referenced by this method body.</param>
/// <param name="threePrimeTerm">Not referenced by this method body.</param>
/// <returns>A new list of targets and/or decoys, depending on <paramref name="generateTargets"/>.</returns>
public static List<RNA> LoadRnaXML(string rnaDbLocation, bool generateTargets, DecoyType decoyType,
    bool isContaminant, IEnumerable<Modification> allKnownModifications,
    IEnumerable<string> modTypesToExclude, out Dictionary<string, Modification> unknownModifications,
    int maxThreads = 1, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null)
{
    var prespecified = ProteinDbLoader.GetPtmListFromProteinXml(rnaDbLocation);
    modTypesToExclude = modTypesToExclude ?? new List<string>();

    // Materialize the combined modification set once: the caller's sequence is
    // enumerated a single time and both lookup tables are built from the same set.
    var knownModifications = new HashSet<Modification>(
        prespecified.Concat(allKnownModifications ?? new List<Modification>()));
    if (knownModifications.Count > 0)
    {
        IdToPossibleMods = ProteinDbLoader.GetModificationDict(knownModifications);
        IdWithMotifToMod = ProteinDbLoader.GetModificationDictWithMotifs(knownModifications);
    }

    List<RNA> targets = new List<RNA>();
    unknownModifications = new Dictionary<string, Modification>();

    // We had trouble decompressing and streaming on the fly, so a compressed database is
    // decompressed completely first, then streamed, then the decompressed copy is deleted.
    // NOTE(review): the copy is always named "temp.xml" next to the database, and File.Create
    // overwrites an existing file of that name; concurrent loads from one directory would collide.
    bool isCompressed = rnaDbLocation.EndsWith(".gz");
    string xmlLocation = isCompressed
        ? Path.Combine(Path.GetDirectoryName(rnaDbLocation), "temp.xml")
        : rnaDbLocation;

    try
    {
        if (isCompressed)
        {
            using var compressedStream = new FileStream(rnaDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read);
            using FileStream decompressedStream = File.Create(xmlLocation);
            using var decompressor = new GZipStream(compressedStream, CompressionMode.Decompress);
            decompressor.CopyTo(decompressedStream);
        }

        using var xmlFileStream = new FileStream(xmlLocation, FileMode.Open, FileAccess.Read, FileShare.Read);
        using XmlReader xml = XmlReader.Create(xmlFileStream);

        ProteinXmlEntry block = new ProteinXmlEntry();
        while (xml.Read())
        {
            if (xml.NodeType == XmlNodeType.Element)
            {
                block.ParseElement(xml.Name, xml);
            }
            if (xml.NodeType == XmlNodeType.EndElement || xml.IsEmptyElement)
            {
                RNA newRna = block.ParseRnaEndElement(xml, modTypesToExclude, unknownModifications, isContaminant, rnaDbLocation);
                if (newRna != null)
                {
                    targets.Add(newRna);
                }
            }
        }
    }
    finally
    {
        // The streams above are disposed when the try block exits, so the temporary copy
        // can be removed here even when decompression or parsing threw.
        if (isCompressed)
        {
            File.Delete(xmlLocation);
        }
    }

    List<RNA> decoys = RnaDecoyGenerator.GenerateDecoys(targets, decoyType, maxThreads);
    return (generateTargets ? targets.Concat(decoys) : decoys).ToList();
}
/// <summary>
/// Generates decoy nucleic acids from any <see cref="INucleicAcid"/> implementor.
/// Only reverse decoys are implemented; slide and shuffle currently throw
/// <see cref="NotImplementedException"/>.
/// </summary>
public static class RnaDecoyGenerator
{
    /// <summary>
    /// Entry point: dispatches to the generator matching <paramref name="decoyType"/>.
    /// </summary>
    /// <param name="nucleicAcids">Targets to derive decoys from.</param>
    /// <param name="decoyType">Which kind of decoy to build; <see cref="DecoyType.None"/> yields an empty list.</param>
    /// <param name="maxThreads">Maximum degree of parallelism handed to the generator (-1 = no limit).</param>
    /// <returns>One decoy per input for implemented decoy types.</returns>
    /// <exception cref="ArgumentOutOfRangeException">For <see cref="DecoyType.Random"/> or any unlisted value.</exception>
    public static List<T> GenerateDecoys<T>(List<T> nucleicAcids, DecoyType decoyType, int maxThreads = -1) where T : INucleicAcid
    {
        switch (decoyType)
        {
            case DecoyType.None:
                return new List<T>();
            case DecoyType.Reverse:
                return GenerateReverseDecoys(nucleicAcids, maxThreads);
            case DecoyType.Slide:
                return GenerateSlidedDecoys(nucleicAcids, maxThreads);
            case DecoyType.Shuffle:
                return GenerateShuffledDecoys(nucleicAcids, maxThreads);
            case DecoyType.Random:
            default:
                throw new ArgumentOutOfRangeException(nameof(decoyType), decoyType, null);
        }
    }

    /// <summary>
    /// Generates decoys in which the sequence is reversed except for the final residue,
    /// which stays in place. Modifications follow their residue: position k (k &lt; length)
    /// moves to length - k, and a modification on the final position keeps its position.
    /// </summary>
    /// <param name="nucleicAcids">Targets to reverse.</param>
    /// <param name="maxThreads">Maximum degree of parallelism (-1 = no limit).</param>
    /// <returns>Decoys in the same order as <paramref name="nucleicAcids"/>.</returns>
    private static List<T> GenerateReverseDecoys<T>(List<T> nucleicAcids, int maxThreads = -1) where T : INucleicAcid
    {
        // Each iteration writes only its own slot, so no lock is needed and the output
        // order matches the input order regardless of thread scheduling.
        var decoyNucleicAcids = new T[nucleicAcids.Count];
        Parallel.For(0, nucleicAcids.Count, new ParallelOptions() { MaxDegreeOfParallelism = maxThreads }, index =>
        {
            T nucleicAcid = nucleicAcids[index];

            // reverse everything but the last residue; sequences of length 0 or 1 are unchanged
            char[] residues = nucleicAcid.BaseSequence.ToCharArray();
            if (residues.Length > 1)
            {
                Array.Reverse(residues, 0, residues.Length - 1);
            }
            var reverseSequence = new string(residues);

            // move each modification along with its residue
            var reverseModifications = new Dictionary<int, List<Modification>>();
            foreach (var kvp in nucleicAcid.OneBasedPossibleLocalizedModifications)
            {
                var reverseKey = kvp.Key == reverseSequence.Length ? kvp.Key : reverseSequence.Length - kvp.Key;
                reverseModifications.Add(reverseKey, kvp.Value);
            }

            decoyNucleicAcids[index] = nucleicAcid.CreateNew(reverseSequence, reverseModifications, true);
        });
        return decoyNucleicAcids.ToList();
    }

    // TODO: Implement slide decoys
    private static List<T> GenerateSlidedDecoys<T>(List<T> nucleicAcids, int maxThreads = -1) where T : INucleicAcid
    {
        throw new NotImplementedException();
    }

    // TODO: Implement shuffle decoys
    private static List<T> GenerateShuffledDecoys<T>(List<T> nucleicAcids, int maxThreads = -1) where T : INucleicAcid
    {
        throw new NotImplementedException();
    }

}
--- mzLib/Test/Transcriptomics/TestDigestion.cs | 6 +++ .../Digestion/OligoWithSetMods.cs | 38 ++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/mzLib/Test/Transcriptomics/TestDigestion.cs b/mzLib/Test/Transcriptomics/TestDigestion.cs index d210b6a2e..0a9c526eb 100644 --- a/mzLib/Test/Transcriptomics/TestDigestion.cs +++ b/mzLib/Test/Transcriptomics/TestDigestion.cs @@ -6,6 +6,7 @@ using Chemistry; using MassSpectrometry; using NUnit.Framework; +using Omics; using Omics.Digestion; using Omics.Fragmentation; using Omics.Modifications; @@ -368,7 +369,12 @@ public static void TestTermini_ThreePrimeCyclicPhosphate() .Select(p => (OligoWithSetMods)p).ToList(); Assert.That(digestionProducts.Count, Is.EqualTo(2)); Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); + Assert.That(digestionProducts[0].SequenceWithChemicalFormulas, Is.EqualTo("UAGUCGUUGAUAG")); + Assert.That(digestionProducts[0].FullSequenceWithMassShift(), Is.EqualTo("UAGUCGUUGAUAG")); + Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG[Digestion Termini:Cyclic Phosphate on X]")); + Assert.That(digestionProducts[1].SequenceWithChemicalFormulas, Is.EqualTo("UAGUCGUUGAUAG[H-2O-1]")); + Assert.That(digestionProducts[1].FullSequenceWithMassShift(), Is.EqualTo("UAGUCGUUGAUAG[-18.010565]")); // top-down digestion, 3' oligo terminal modification variableMods = new List { oligoCyclicPhosphate }; diff --git a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs index f5e51c19c..2ef0ae3f2 100644 --- a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs +++ b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs @@ -6,10 +6,10 @@ using Omics; using Easy.Common.Extensions; using Omics.Fragmentation.Oligo; +using System.Text; namespace Transcriptomics.Digestion { - /// /// Represents an oligonucleotide with set modifications, providing properties and methods for /// accessing and 
/// <summary>
/// The base sequence with the chemical formula of each modification written in square
/// brackets at its position, e.g. "UAGUCGUUGAUAG[H-2O-1]". Built on first access and cached.
/// </summary>
public string SequenceWithChemicalFormulas
{
    get
    {
        if (_sequenceWithChemicalFormula is null)
        {
            var builder = new StringBuilder();

            // key 1 in AllModsOneIsNterminus is written before the first residue
            AppendFormulaAt(1);

            // residue at zero-based index i uses key i + 2
            for (int i = 0; i < Length; i++)
            {
                builder.Append(this[i]);
                AppendFormulaAt(i + 2);
            }

            // key Length + 2 is written after the last residue
            AppendFormulaAt(Length + 2);

            _sequenceWithChemicalFormula = builder.ToString();

            // Appends "[formula]" when a non-null modification is stored under the key.
            void AppendFormulaAt(int key)
            {
                if (AllModsOneIsNterminus.TryGetValue(key, out Modification? candidate) && candidate is { } present)
                {
                    builder.Append('[' + present.ChemicalFormula.Formula + ']');
                }
            }
        }

        return _sequenceWithChemicalFormula;
    }
}
--- mzLib/Test/Test.csproj | 7 +- .../TestData/ModomicsUnmodifiedTrimmed.fasta | 10 + .../ModomicsUnmodifiedTrimmed.fasta.gz | Bin 0 -> 369 bytes mzLib/Test/Transcriptomics/TestDbLoader.cs | 149 ++++++++++ .../Transcriptomics/TestDecoyGenerator.cs | 227 ++++++++++++++ mzLib/Transcriptomics/ClassExtensions.cs | 40 +++ .../Digestion/NucleolyticOligo.cs | 3 + .../FastaHeaderFieldRegex.cs | 12 + .../ProteinDbWriter.cs | 280 +++++++++++++++++- .../Transcriptomics/RnaDbLoader.cs | 48 ++- mzLib/mzLib.sln.DotSettings | 2 + 11 files changed, 757 insertions(+), 21 deletions(-) create mode 100644 mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta create mode 100644 mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta.gz create mode 100644 mzLib/Test/Transcriptomics/TestDbLoader.cs create mode 100644 mzLib/Test/Transcriptomics/TestDecoyGenerator.cs diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index a4cf4545d..76ab4ac9b 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -494,6 +494,12 @@ Always + + + PreserveNewest + + + PreserveNewest Always @@ -513,7 +519,6 @@ - diff --git a/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta b/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta new file mode 100644 index 000000000..18802a82a --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta @@ -0,0 +1,10 @@ +>id:1|Name:tdbR00000010|SOterm:SO:0000254|Type:tRNA|Subtype:Ala|Feature:VGC|Cellular_Localization:prokaryotic cytosol|Species:Escherichia coli +GGGGCUAUAGCUCAGCUGGGAGAGCGCCUGCUUUGCACGCAGGAGGUCUGCGGUUCGAUCCCGCAUAGCUCCACCA +>id:2|Name:tdbR00000008|SOterm:SO:0000254|Type:tRNA|Subtype:Ala|Feature:GGC|Cellular_Localization:prokaryotic cytosol|Species:Escherichia coli +GGGGCUAUAGCUCAGCUGGGAGAGCGCUUGCAUGGCAUGCAAGAGGUCAGCGGUUCGAUCCCGCUUAGCUCCACCA +>id:3|Name:tdbR00000356|SOterm:SO:0001036|Type:tRNA|Subtype:Arg|Feature:ICG|Cellular_Localization:prokaryotic 
cytosol|Species:Escherichia coli +GCAUCCGUAGCUCAGCUGGAUAGAGUACUCGGCUACGAACCGAGCGGUCGGAGGUUCGAAUCCUCCCGGAUGCACCA +>id:4|Name:tdbR00000359|SOterm:SO:0001036|Type:tRNA|Subtype:Arg|Feature:CCG|Cellular_Localization:prokaryotic cytosol|Species:Escherichia coli +GCGCCCGUAGCUCAGCUGGAUAGAGCGCUGCCCUCCGGAGGCAGAGGUCUCAGGUUCGAAUCCUGUCGGGCGCGCCA +>id:5|Name:tdbR00000358|SOterm:SO:0001036|Type:tRNA|Subtype:Arg|Feature:UCU|Cellular_Localization:prokaryotic cytosol|Species:Escherichia coli +GCGCCCUUAGCUCAGUUGGAUAGAGCAACGACCUUCUAAGUCGUGGGCCGCAGGUUCGAAUCCUGCAGGGCGCGCCA diff --git a/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta.gz b/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta.gz new file mode 100644 index 0000000000000000000000000000000000000000..11ab87ef2427bfc92c81c872815097a5e770d643 GIT binary patch literal 369 zcmV-%0gnD3iwFo?vchEq08MXXZ*6H~b5(9_Z)9m^X=P+oa%pXCWn?a9VRLk00JYN1 zYQr!L0N{I}@6bm`f9ZS}2(g*LXrY;R9jO>1To$mMpPI03Kov(;%Et$3H1(R#o2 zI)1zcqrLml$%SvRj^RtkT}aM!%r1pZ@VQ)DQPOz^JVl60X01r(73 z?Qu6L~~#y9Ka|LZCk52CXQGF&lSp)z8*Q2z4 literal 0 HcmV?d00001 diff --git a/mzLib/Test/Transcriptomics/TestDbLoader.cs b/mzLib/Test/Transcriptomics/TestDbLoader.cs new file mode 100644 index 000000000..6e1725b8f --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestDbLoader.cs @@ -0,0 +1,149 @@ +using NUnit.Framework; +using Omics.Modifications; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using UsefulProteomicsDatabases.Transcriptomics; +using UsefulProteomicsDatabases; +using Transcriptomics; + +namespace Test.Transcriptomics +{ + [TestFixture] + [ExcludeFromCodeCoverage] + internal class TestDbLoader + { + public static string ModomicsUnmodifedFastaPath => Path.Combine(TestContext.CurrentContext.TestDirectory, + "Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta"); + + /// + /// Detect the headertype of the 
test cases + /// + private static IEnumerable<(string, RnaFastaHeaderType)> DetectHeaderTestCases => + new List<(string, RnaFastaHeaderType)> + { + (Path.Combine(TestContext.CurrentContext.TestDirectory, "DoubleProtease.tsv"), RnaFastaHeaderType.Unknown), + (ModomicsUnmodifedFastaPath, RnaFastaHeaderType.Modomics), + (Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta"), RnaFastaHeaderType.Modomics), + + }; + + /// + /// Test the correctness of checking headertype + /// + /// + [Test] + [TestCaseSource(nameof(DetectHeaderTestCases))] + public static void TestDetectHeaderType((string dbPath, RnaFastaHeaderType headerType) testData) + { + string line = File.ReadLines(testData.dbPath).First(); + if (char.IsDigit(line.First())) + { + line = File.ReadLines(testData.dbPath).Skip(1).First(); + } + var type = RnaDbLoader.DetectRnaFastaHeaderType(line); + Assert.That(testData.headerType, Is.EqualTo(type)); + } + + + [Test] + [TestCase("ModomicsUnmodifiedTrimmed.fasta")] + [TestCase("ModomicsUnmodifiedTrimmed.fasta.gz")] + public static void TestModomicsUnmodifiedFasta(string databaseFileName) + { + var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", + databaseFileName); + var oligos = RnaDbLoader.LoadRnaFasta(dbPath, true, DecoyType.None, false, + out var errors); + Assert.That(errors.Count, Is.EqualTo(0)); + Assert.That(oligos.Count, Is.EqualTo(5)); + Assert.That(oligos.First().BaseSequence, + Is.EqualTo("GGGGCUAUAGCUCAGCUGGGAGAGCGCCUGCUUUGCACGCAGGAGGUCUGCGGUUCGAUCCCGCAUAGCUCCACCA")); + Assert.That(oligos.First().Name, Is.EqualTo("tdbR00000010")); + Assert.That(oligos.First().Accession, Is.EqualTo("SO:0000254")); + Assert.That(oligos.First().Organism, Is.EqualTo("Escherichia coli")); + Assert.That(oligos.First().DatabaseFilePath, Is.EqualTo(dbPath)); + Assert.That(oligos.First().IsContaminant, Is.False); + Assert.That(oligos.First().IsDecoy, Is.False); + 
Assert.That(oligos.First().AdditionalDatabaseFields!.Count, Is.EqualTo(5)); + Assert.That(oligos.First().AdditionalDatabaseFields!["Id"], Is.EqualTo("1")); + Assert.That(oligos.First().AdditionalDatabaseFields!["Type"], Is.EqualTo("tRNA")); + Assert.That(oligos.First().AdditionalDatabaseFields!["Subtype"], Is.EqualTo("Ala")); + Assert.That(oligos.First().AdditionalDatabaseFields!["Feature"], Is.EqualTo("VGC")); + Assert.That(oligos.First().AdditionalDatabaseFields!["Cellular Localization"], Is.EqualTo("prokaryotic cytosol")); + } + + [Test] + public static void TestContaminantFollowsThrough() + { + var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifedFastaPath, true, DecoyType.None, true, + out var errors); + Assert.That(errors.Count, Is.EqualTo(0)); + Assert.That(oligos.Count, Is.EqualTo(5)); + Assert.That(oligos.First().BaseSequence, + Is.EqualTo("GGGGCUAUAGCUCAGCUGGGAGAGCGCCUGCUUUGCACGCAGGAGGUCUGCGGUUCGAUCCCGCAUAGCUCCACCA")); + Assert.That(oligos.All(p => p.IsContaminant)); + Assert.That(oligos.All(p => !p.IsDecoy)); + } + + [Test] + public static void TestNotGeneratingTargetsOrDecoys() + { + var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifedFastaPath, false, DecoyType.None, true, + out var errors); + Assert.That(errors.Count, Is.EqualTo(0)); + Assert.That(oligos.Count, Is.EqualTo(0)); + } + + [Test] + public static void TestXmlWriterReader() + { + var rna = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifedFastaPath, true, DecoyType.None, false, out var errors); + Assert.That(errors.Count, Is.EqualTo(0)); + + var modString = "ID Methylation\r\nMT Biological\r\nPP Anywhere.\r\nTG G\r\nCF C1H2\r\n" + @"//"; + var methylG = PtmListLoader.ReadModsFromString(modString, out List<(Modification, string)> modsOut).First(); + + Dictionary>> mods = new Dictionary>>(); + mods.Add("SO:0000254", new HashSet>() + { + new Tuple(1, methylG), + new Tuple(3, methylG) + }); + + string outpath = Path.Combine(TestContext.CurrentContext.TestDirectory, 
"Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.xml"); + + var xml = ProteinDbWriter.WriteXmlDatabase(mods, rna, outpath); + var temp = RnaDbLoader.LoadRnaXML(outpath, true, DecoyType.None, false, + new List() { methylG }, new List(), out var unknownMods); + + Assert.That(unknownMods.Count, Is.EqualTo(0)); + Assert.That(temp.Count, Is.EqualTo(5)); + var first = temp.First(); + var loadedMods = first.OneBasedPossibleLocalizedModifications; + Assert.That(loadedMods.Count, Is.EqualTo(2)); + Assert.That(loadedMods[1].Count, Is.EqualTo(1)); + Assert.That(loadedMods[3].Count, Is.EqualTo(1)); + Assert.That(loadedMods[1].First().IdWithMotif, Is.EqualTo(methylG.IdWithMotif)); + Assert.That(loadedMods[3].First().IdWithMotif, Is.EqualTo(methylG.IdWithMotif)); + } + + [Test] + [TestCase("ATCG", "AUCG", true)] + [TestCase("ATCG", "UAGC", false)] + [TestCase("ATCGZ", "AUCGZ", true)] + [TestCase("ATCGZ", "UAGCZ", false)] + [TestCase("ATCGACGAATCACGATCAGTCATGCATTGCTAACT", "AUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACU", true)] + [TestCase("ATCGACGAATCACGATCAGTCATGCATTGCTAACT", "UAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGA", false)] + [TestCase("ATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACT", "AUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACU", true)] + [TestCase("ATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACT", 
/// <summary>
/// Tests for <see cref="RnaDecoyGenerator"/>: reverse decoys are checked in detail;
/// slide and shuffle decoys are only checked to throw until they are implemented.
/// </summary>
[TestFixture]
[ExcludeFromCodeCoverage]
internal class TestDecoyGeneration
{
    public static string ModomicsUnmodifiedFastaPath => TestDbLoader.ModomicsUnmodifedFastaPath;

    /// <summary>Two unmodified six-mers shared by the not-yet-implemented decoy tests.</summary>
    private static List<RNA> SimpleOligos() => new List<RNA>()
    {
        new RNA("GUACUG"),
        new RNA("GUACUA"),
    };

    [Test]
    public static void TestReverseDecoy_Simple()
    {
        var oligos = new List<RNA>()
        {
            new RNA("GUUCUG"),
            new RNA("GUGCUA"),
        };
        var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Reverse, 1);
        Assert.That(decoys.Count, Is.EqualTo(2));
        Assert.That(decoys[0].BaseSequence, Is.EqualTo("UCUUGG"));
        Assert.That(decoys[1].BaseSequence, Is.EqualTo("UCGUGA"));

        var example = oligos.First();
        Assert.That(decoys.All(p => !p.IsContaminant));
        Assert.That(decoys.All(p => p.IsDecoy));
        Assert.That(decoys.All(p => p.DatabaseFilePath == example.DatabaseFilePath));
        Assert.That(decoys.All(p => p.Organism == example.Organism));
        Assert.That(decoys.All(p => p.AdditionalDatabaseFields == example.AdditionalDatabaseFields));
        Assert.That(decoys.All(p => p.Accession == example.Accession));
        Assert.That(decoys.All(p => p.Name == example.Name));
        Assert.That(decoys.All(p => p.Length == example.Length));
        Assert.That(decoys.All(p => Equals(p.FivePrimeTerminus, example.FivePrimeTerminus)));
        Assert.That(decoys.All(p => Equals(p.ThreePrimeTerminus, example.ThreePrimeTerminus)));
        Assert.That(decoys.All(p => p.OneBasedPossibleLocalizedModifications.Count == example.OneBasedPossibleLocalizedModifications.Count));
    }

    [Test]
    [TestCase("GUACUG", 1, "UCAUGG", 5)]
    [TestCase("GUACUA", 2, "UCAUGA", 4)]
    [TestCase("GUACUA", 3, "UCAUGA", 3)]
    [TestCase("GUACUA", 4, "UCAUGA", 2)]
    [TestCase("GUCCAA", 5, "ACCUGA", 1)]
    [TestCase("GUUCUA", 6, "UCUUGA", 6)]
    public static void TestReverseDecoy_SimpleWithMods(string rnaSequence, int modPosition, string expectedDecoySequence, int expectedDecoyModPosition)
    {
        var mod = new Modification();
        var oligos = new List<RNA>()
        {
            new RNA(rnaSequence, null, null,
                new Dictionary<int, List<Modification>>()
                    { { modPosition, new List<Modification>() { mod } } }),
        };
        var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Reverse, 1);
        Assert.That(decoys.Count, Is.EqualTo(1));

        var decoy = decoys.First();
        var originalRna = oligos.First();
        Assert.That(decoy.BaseSequence, Is.EqualTo(expectedDecoySequence));
        Assert.That(decoy.OneBasedPossibleLocalizedModifications.Count, Is.EqualTo(1));
        Assert.That(decoy.OneBasedPossibleLocalizedModifications.First().Key, Is.EqualTo(expectedDecoyModPosition));
        Assert.That(decoy.OneBasedPossibleLocalizedModifications.First().Value.Count, Is.EqualTo(1));
        Assert.That(decoy.OneBasedPossibleLocalizedModifications.First().Value.First(), Is.EqualTo(mod));
        Assert.That(decoy.Name, Is.EqualTo(originalRna.Name));
        Assert.That(decoy.Accession, Is.EqualTo(originalRna.Accession));
        Assert.That(decoy.Organism, Is.EqualTo(originalRna.Organism));
        Assert.That(decoy.DatabaseFilePath, Is.EqualTo(originalRna.DatabaseFilePath));
        Assert.That(decoy.IsContaminant, Is.EqualTo(originalRna.IsContaminant));
        Assert.That(decoy.IsDecoy, Is.True);
        Assert.That(decoy.AdditionalDatabaseFields, Is.EqualTo(originalRna.AdditionalDatabaseFields));
        Assert.That(decoy.FivePrimeTerminus, Is.EqualTo(originalRna.FivePrimeTerminus));
        Assert.That(decoy.ThreePrimeTerminus, Is.EqualTo(originalRna.ThreePrimeTerminus));
    }

    [Test]
    public void TestReverseDecoy_FromDatabase()
    {
        int numSequences = 5;
        Dictionary<string, string> expectedSequences = new Dictionary<string, string>()
        {
            { "tdbR00000010", "CCACCUCGAUACGCCCUAGCUUGGCGUCUGGAGGACGCACGUUUCGUCCGCGAGAGGGUCGACUCGAUAUCGGGGA"},
            { "tdbR00000008", "CCACCUCGAUUCGCCCUAGCUUGGCGACUGGAGAACGUACGGUACGUUCGCGAGAGGGUCGACUCGAUAUCGGGGA"},
            { "tdbR00000356", "CCACGUAGGCCCUCCUAAGCUUGGAGGCUGGCGAGCCAAGCAUCGGCUCAUGAGAUAGGUCGACUCGAUGCCUACGA"},
            { "tdbR00000359", "CCGCGCGGGCUGUCCUAAGCUUGGACUCUGGAGACGGAGGCCUCCCGUCGCGAGAUAGGUCGACUCGAUGCCCGCGA"},
            { "tdbR00000358", "CCGCGCGGGACGUCCUAAGCUUGGACGCCGGGUGCUGAAUCUUCCAGCAACGAGAUAGGUUGACUCGAUUCCCGCGA"},
        };

        var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifiedFastaPath, true, DecoyType.Reverse, false,
            out var errors);
        Assert.That(errors.Count, Is.EqualTo(0));
        Assert.That(oligos.Count, Is.EqualTo(numSequences * 2));
        Assert.That(oligos.Count(p => p.IsDecoy), Is.EqualTo(numSequences));
        Assert.That(oligos.Count(p => !p.IsDecoy), Is.EqualTo(numSequences));

        // targets and decoys share a name, so every group must hold exactly one of each
        foreach (var targetDecoyGroup in oligos.GroupBy(p => p.Name))
        {
            Assert.That(targetDecoyGroup.Count(), Is.EqualTo(2));
            var target = targetDecoyGroup.First(p => !p.IsDecoy);
            var decoy = targetDecoyGroup.First(p => p.IsDecoy);
            var expectedSequence = expectedSequences[target.Name];

            Assert.That(target.FivePrimeTerminus, Is.EqualTo(decoy.FivePrimeTerminus));
            Assert.That(target.ThreePrimeTerminus, Is.EqualTo(decoy.ThreePrimeTerminus));
            Assert.That(target.AdditionalDatabaseFields, Is.EqualTo(decoy.AdditionalDatabaseFields));
            Assert.That(target.IsContaminant, Is.EqualTo(decoy.IsContaminant));
            Assert.That(target.DatabaseFilePath, Is.EqualTo(decoy.DatabaseFilePath));
            Assert.That(target.DatabaseFilePath, Is.EqualTo(ModomicsUnmodifiedFastaPath));
            Assert.That(target.Organism, Is.EqualTo(decoy.Organism));
            Assert.That(target.Accession, Is.EqualTo(decoy.Accession));
            Assert.That(target.Name, Is.EqualTo(decoy.Name));
            Assert.That(target.Length, Is.EqualTo(decoy.Length));
            Assert.That(target.OneBasedPossibleLocalizedModifications.Count, Is.EqualTo(decoy.OneBasedPossibleLocalizedModifications.Count));

            Assert.That(decoy.BaseSequence, Is.EqualTo(expectedSequence));
        }
    }


    // TODO: Replace the Throws assertions below with real expectations once shuffle and slide
    // decoys are available (2 decoys for the simple cases, 10 entries and no errors from the database).

    [Test]
    public void TestShuffledDecoy_Simple()
    {
        var oligos = SimpleOligos();
        Assert.Throws<NotImplementedException>(() => RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Shuffle));
    }

    [Test]
    public void TestShuffledDecoy_SimpleWithMods()
    {
        var oligos = SimpleOligos();
        Assert.Throws<NotImplementedException>(() => RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Shuffle));
    }

    [Test]
    public void TestShuffledDecoy_FromDatabase()
    {
        Assert.Throws<NotImplementedException>(() =>
            RnaDbLoader.LoadRnaFasta(ModomicsUnmodifiedFastaPath, true, DecoyType.Shuffle, false, out _));
    }

    [Test]
    public void TestSlideDecoy_Simple()
    {
        var oligos = SimpleOligos();
        Assert.Throws<NotImplementedException>(() => RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Slide));
    }

    [Test]
    public void TestSlideDecoy_SimpleWithMods()
    {
        var oligos = SimpleOligos();
        Assert.Throws<NotImplementedException>(() => RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Slide));
    }

    [Test]
    public void TestSlideDecoy_FromDatabase()
    {
        // Must request Slide (not Shuffle) so this test covers the slide path through the loader.
        Assert.Throws<NotImplementedException>(() =>
            RnaDbLoader.LoadRnaFasta(ModomicsUnmodifiedFastaPath, true, DecoyType.Slide, false, out _));
    }
}
/// <summary>
/// Transcribes a DNA sequence into an RNA sequence.
/// For a coding strand every 'T' becomes 'U' and all other characters are copied unchanged.
/// For a template strand each base is replaced by its RNA complement
/// (A→U, T→A, C→G, G→C) and any other character is copied unchanged.
/// </summary>
/// <param name="dna">The input DNA sequence.</param>
/// <param name="isCodingStrand">True if the input is the coding strand, false if it is the template strand.</param>
/// <returns>The transcribed sequence, the same length as <paramref name="dna"/>.</returns>
public static string Transcribe(this string dna, bool isCodingStrand = true)
{
    var transcript = new char[dna.Length];
    for (int i = 0; i < dna.Length; i++)
    {
        char nucleotide = dna[i];
        if (isCodingStrand)
        {
            transcript[i] = nucleotide == 'T' ? 'U' : nucleotide;
            continue;
        }

        transcript[i] = nucleotide switch
        {
            'A' => 'U',
            'T' => 'A',
            'C' => 'G',
            'G' => 'C',
            _ => nucleotide,
        };
    }

    return new string(transcript);
}
result = null; + var matches = Regex.Matches(input); + if (matches.Count > Match && matches[Match].Groups.Count > Group) + { + result = matches[Match].Groups[Group].Value; + } + + return result!; + } } } \ No newline at end of file diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs index 155945558..d47912f4d 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs @@ -5,12 +5,283 @@ using System.IO; using System.Linq; using System.Xml; +using Easy.Common.Extensions; +using Omics; using Omics.Modifications; +using Transcriptomics; namespace UsefulProteomicsDatabases { + + /// + /// Provides methods for writing protein and nucleic acid databases to XML and FASTA formats. + /// Did not rename to DbWriter to ensure compatibility with the original UsefulProteomicsDatabases namespace. + /// public class ProteinDbWriter { + /// + /// Writes an XML database for a list of RNA sequences, including additional modifications. + /// + /// A dictionary of additional modifications to add to proteins. + /// A list of RNA sequences to be written to the database. + /// The name of the output XML file. + /// A dictionary of new modification residue entries. + public static Dictionary WriteXmlDatabase( + Dictionary>> additionalModsToAddToProteins, + List bioPolymerList, string outputFileName) => WriteNucleicAcidXmlDatabase(additionalModsToAddToProteins, bioPolymerList.Cast().ToList(), outputFileName); + + /// + /// Writes an XML database for a list of nucleic acid sequences, including additional modifications. + /// + /// A dictionary of additional modifications to add to proteins. + /// A list of nucleic acid sequences to be written to the database. + /// The name of the output XML file. + /// A dictionary of new modification residue entries. 
+ private static Dictionary WriteNucleicAcidXmlDatabase( + Dictionary>> additionalModsToAddToProteins, + List nucleicAcidList, string outputFileName) + { + additionalModsToAddToProteins = additionalModsToAddToProteins ?? new Dictionary>>(); + var xmlWriterSettings = new XmlWriterSettings + { + Indent = true, + IndentChars = " " + }; + + Dictionary newModResEntries = new Dictionary(); + using (XmlWriter writer = XmlWriter.Create(outputFileName, xmlWriterSettings)) + { + writer.WriteStartDocument(); + writer.WriteStartElement("mzLibProteinDb"); + + List myModificationList = new List(); + foreach (var p in nucleicAcidList) + { + foreach (KeyValuePair> entry in p.OneBasedPossibleLocalizedModifications) + { + myModificationList.AddRange(entry.Value); + } + } + + // get modifications from nucleic acid list and concatenate the modifications discovered in GPTMDictionary + var allRelevantModifications = + new HashSet(nucleicAcidList + .SelectMany(p => p.OneBasedPossibleLocalizedModifications.SelectMany(m => m.Value)) + .Concat(additionalModsToAddToProteins + .Where(n => nucleicAcidList.Select(nu => nu.Accession).Contains(n.Key)) + .SelectMany(kv => kv.Value.Select(v => v.Item2)))); + + foreach (Modification mod in allRelevantModifications.OrderBy(m => m.IdWithMotif)) + { + writer.WriteStartElement("modification"); + writer.WriteString(mod.ToString() + Environment.NewLine + "//"); + writer.WriteEndElement(); + } + + foreach (var nucleicAcid in nucleicAcidList) + { + writer.WriteStartElement("entry"); + writer.WriteStartElement("accession"); + writer.WriteString(nucleicAcid.Accession); + writer.WriteEndElement(); + + if (nucleicAcid.Name.IsNotNullOrEmptyOrWhiteSpace()) + { + writer.WriteStartElement("name"); + writer.WriteString(nucleicAcid.Name); + writer.WriteEndElement(); + } + + if (nucleicAcid.FullName.IsNotNullOrEmptyOrWhiteSpace()) + { + writer.WriteStartElement("protein"); + writer.WriteStartElement("recommendedName"); + writer.WriteStartElement("fullName"); + 
writer.WriteString(nucleicAcid.FullName); + writer.WriteEndElement(); + writer.WriteEndElement(); + writer.WriteEndElement(); + } + + writer.WriteStartElement("gene"); + foreach (var geneName in nucleicAcid.GeneNames) + { + writer.WriteStartElement("name"); + writer.WriteAttributeString("type", geneName.Item1); + writer.WriteString(geneName.Item2); + writer.WriteEndElement(); + } + writer.WriteEndElement(); + + if (nucleicAcid.Organism.IsNotNullOrEmptyOrWhiteSpace()) + { + writer.WriteStartElement("organism"); + writer.WriteStartElement("name"); + writer.WriteAttributeString("type", "scientific"); + writer.WriteString(nucleicAcid.Organism); + writer.WriteEndElement(); + writer.WriteEndElement(); + } + + //foreach (var dbRef in nucleicAcid) + //{ + // writer.WriteStartElement("dbReference"); + // writer.WriteAttributeString("type", dbRef.Type); + // writer.WriteAttributeString("id", dbRef.Id); + // foreach (Tuple property in dbRef.Properties) + // { + // writer.WriteStartElement("property"); + // writer.WriteAttributeString("type", property.Item1); + // writer.WriteAttributeString("value", property.Item2); + // writer.WriteEndElement(); + // } + // writer.WriteEndElement(); + //} + + ////for now we are not going to write top-down truncations generated for top-down truncation search. 
+ ////some day we could write those if observed + ////the truncation designation is contained in the "type" field of ProteolysisProduct + //List proteolysisProducts = nucleicAcid.ProteolysisProducts.Where(p => !p.Type.Contains("truncation")).ToList(); + //foreach (var proteolysisProduct in proteolysisProducts) + //{ + // writer.WriteStartElement("feature"); + // writer.WriteAttributeString("type", proteolysisProduct.Type.Split('(')[0]); + // writer.WriteStartElement("location"); + // writer.WriteStartElement("begin"); + // writer.WriteAttributeString("position", proteolysisProduct.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // writer.WriteStartElement("end"); + // writer.WriteAttributeString("position", proteolysisProduct.OneBasedEndPosition.ToString()); + // writer.WriteEndElement(); + // writer.WriteEndElement(); + // writer.WriteEndElement(); + //} + + foreach (var hm in GetModsForThisBioPolymer(nucleicAcid, null, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key)) + { + foreach (var modId in hm.Value) + { + writer.WriteStartElement("feature"); + writer.WriteAttributeString("type", "modified residue"); + writer.WriteAttributeString("description", modId); + writer.WriteStartElement("location"); + writer.WriteStartElement("position"); + writer.WriteAttributeString("position", hm.Key.ToString(CultureInfo.InvariantCulture)); + writer.WriteEndElement(); + writer.WriteEndElement(); + writer.WriteEndElement(); + } + } + + //foreach (var hm in nucleicAcid.SequenceVariations) + //{ + // writer.WriteStartElement("feature"); + // writer.WriteAttributeString("type", "sequence variant"); + // writer.WriteAttributeString("description", hm.Description.ToString()); + // writer.WriteStartElement("original"); + // writer.WriteString(hm.OriginalSequence); + // writer.WriteEndElement(); // original + // writer.WriteStartElement("variation"); + // writer.WriteString(hm.VariantSequence); + // writer.WriteEndElement(); // variation + // 
writer.WriteStartElement("location"); + // if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition) + // { + // writer.WriteStartElement("position"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // } + // else + // { + // writer.WriteStartElement("begin"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // writer.WriteStartElement("end"); + // writer.WriteAttributeString("position", hm.OneBasedEndPosition.ToString()); + // writer.WriteEndElement(); + // } + // foreach (var hmm in GetModsForThisProtein(nucleicAcid, hm, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key)) + // { + // foreach (var modId in hmm.Value) + // { + // writer.WriteStartElement("subfeature"); + // writer.WriteAttributeString("type", "modified residue"); + // writer.WriteAttributeString("description", modId); + // writer.WriteStartElement("location"); + // writer.WriteStartElement("subposition"); + // writer.WriteAttributeString("subposition", hmm.Key.ToString(CultureInfo.InvariantCulture)); + // writer.WriteEndElement(); + // writer.WriteEndElement(); + // writer.WriteEndElement(); + // } + // } + // writer.WriteEndElement(); // location + // writer.WriteEndElement(); // feature + //} + + //foreach (var hm in nucleicAcid.DisulfideBonds) + //{ + // writer.WriteStartElement("feature"); + // writer.WriteAttributeString("type", "disulfide bond"); + // writer.WriteAttributeString("description", hm.Description); + // writer.WriteStartElement("location"); + // if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition) + // { + // writer.WriteStartElement("position"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // } + // else + // { + // writer.WriteStartElement("begin"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // 
writer.WriteEndElement(); + // writer.WriteStartElement("end"); + // writer.WriteAttributeString("position", hm.OneBasedEndPosition.ToString()); + // writer.WriteEndElement(); + // } + // writer.WriteEndElement(); // location + // writer.WriteEndElement(); // feature + //} + + //foreach (var hm in nucleicAcid.SpliceSites) + //{ + // writer.WriteStartElement("feature"); + // writer.WriteAttributeString("type", "splice site"); + // writer.WriteAttributeString("description", hm.Description); + // writer.WriteStartElement("location"); + // if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition) + // { + // writer.WriteStartElement("position"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // } + // else + // { + // writer.WriteStartElement("begin"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // writer.WriteStartElement("end"); + // writer.WriteAttributeString("position", hm.OneBasedEndPosition.ToString()); + // writer.WriteEndElement(); + // } + // writer.WriteEndElement(); // location + // writer.WriteEndElement(); // feature + //} + + writer.WriteStartElement("sequence"); + writer.WriteAttributeString("length", nucleicAcid.Length.ToString(CultureInfo.InvariantCulture)); + writer.WriteString(nucleicAcid.BaseSequence); + writer.WriteEndElement(); // sequence + writer.WriteEndElement(); // entry + } + + writer.WriteEndElement(); // mzLibProteinDb + writer.WriteEndDocument(); + } + return newModResEntries; + } + /// /// Writes a protein database in mzLibProteinDb format, with additional modifications from the AdditionalModsToAddToProteins list. 
/// @@ -138,7 +409,7 @@ public static Dictionary WriteXmlDatabase(Dictionary b.Key)) + foreach (var hm in GetModsForThisBioPolymer(protein, null, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key)) { foreach (var modId in hm.Value) { @@ -181,7 +452,7 @@ public static Dictionary WriteXmlDatabase(Dictionary b.Key)) + foreach (var hmm in GetModsForThisBioPolymer(protein, hm, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key)) { foreach (var modId in hmm.Value) { @@ -276,7 +547,7 @@ public static void WriteFastaDatabase(List proteinList, string outputFi } } - private static Dictionary> GetModsForThisProtein(Protein protein, SequenceVariation seqvar, Dictionary>> additionalModsToAddToProteins, Dictionary newModResEntries) + private static Dictionary> GetModsForThisBioPolymer(IBioPolymer protein, SequenceVariation seqvar, Dictionary>> additionalModsToAddToProteins, Dictionary newModResEntries) { var modsToWriteForThisSpecificProtein = new Dictionary>(); @@ -292,7 +563,8 @@ private static Dictionary> GetModsForThisProtein(Protein pr } } - string accession = seqvar == null ? protein.Accession : VariantApplication.GetAccession(protein, new[] { seqvar }); + // This cast to protein is okay as no sequence variation is programmed to RNA as of 9/24/24 + string accession = seqvar == null ? 
protein.Accession : VariantApplication.GetAccession(protein as Protein, new[] { seqvar }); if (additionalModsToAddToProteins.ContainsKey(accession)) { foreach (var ye in additionalModsToAddToProteins[accession]) diff --git a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs index 1828f7b4c..13aa82fb2 100644 --- a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs +++ b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs @@ -22,6 +22,19 @@ public enum RnaFastaHeaderType public static class RnaDbLoader { + #region Header Detection and Property Regexes + + public static RnaFastaHeaderType DetectRnaFastaHeaderType(string line) + { + if (line.StartsWith(">id")) + return RnaFastaHeaderType.Modomics; + + return RnaFastaHeaderType.Unknown; + } + + /// + /// Dictionary that extract accession number, species, name, and additional dataField of modomics + /// public static readonly Dictionary ModomicsFieldRegexes = new Dictionary() { @@ -35,6 +48,20 @@ public static class RnaDbLoader { "Cellular Localization", new FastaHeaderFieldRegex("CellularLocalization", @"Cellular_Localization:(?.+?)\|", 0, 1) }, }; + #endregion + + /// + /// Loads an RNA file from the specified location, optionally generating decoys and adding error tracking + /// + /// The file path to the RNA FASTA database + /// Flag indicating whether to generate targets or not + /// The type of decoy generation to apply + /// Indicates if the RNA sequence is a contaminant + /// Outputs any errors encountered during the process + /// An optional 5' prime chemical modification term + /// An optional 3' prime chemical modification term + /// A list of RNA sequences loaded from the FASTA database + /// Thrown if the FASTA header format is unknown or other issues occur during loading. 
public static List LoadRnaFasta(string rnaDbLocation, bool generateTargets, DecoyType decoyType, bool isContaminant, out List errors, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null) @@ -78,7 +105,7 @@ public static List LoadRnaFasta(string rnaDbLocation, bool generateTargets, { if (headerType is null) { - headerType = DetectFastaHeaderType(line); + headerType = DetectRnaFastaHeaderType(line); switch (headerType) { @@ -86,9 +113,6 @@ public static List LoadRnaFasta(string rnaDbLocation, bool generateTargets, regexes = ModomicsFieldRegexes; identifierHeader = "SOterm"; break; - - case RnaFastaHeaderType.Unknown: - case null: default: throw new MzLibUtil.MzLibException("Unknown fasta header format: " + line); } @@ -105,9 +129,9 @@ public static List LoadRnaFasta(string rnaDbLocation, bool generateTargets, sb = new StringBuilder(); } - else if (sb is not null) + else { - sb.Append(line.Trim()); + sb?.Append(line.Trim()); } if ((fasta.Peek() == '>' || fasta.Peek() == -1) /*&& accession != null*/ && sb != null) @@ -150,14 +174,7 @@ public static List LoadRnaFasta(string rnaDbLocation, bool generateTargets, return generateTargets ? 
targets.Concat(decoys).ToList() : decoys; } - private static RnaFastaHeaderType DetectFastaHeaderType(string line) - { - if (!line.StartsWith(">")) - return RnaFastaHeaderType.Unknown; - - - return RnaFastaHeaderType.Modomics; - } + private static Dictionary ParseRegexFields(string line, Dictionary regexes) @@ -166,14 +183,13 @@ private static Dictionary ParseRegexFields(string line, foreach (var regex in regexes) { - string match = ProteinDbLoader.ApplyRegex(regex.Value, line); + string match = regex.Value.ApplyRegex(line); fields.Add(regex.Key, match); } return fields; } - public static Dictionary> IdToPossibleMods = new Dictionary>(); public static Dictionary IdWithMotifToMod = new Dictionary(); diff --git a/mzLib/mzLib.sln.DotSettings b/mzLib/mzLib.sln.DotSettings index 06594535d..6522afcd5 100644 --- a/mzLib/mzLib.sln.DotSettings +++ b/mzLib/mzLib.sln.DotSettings @@ -4,9 +4,11 @@ True True True + True True True True + True True True True From d75f75207b45339b0260d8b2c56cdf64fbe98db5 Mon Sep 17 00:00:00 2001 From: Nic Bollis Date: Tue, 24 Sep 2024 17:49:35 -0500 Subject: [PATCH 10/17] Refactor and enhance RNA and oligo handling in tests - Added `using` directives for `Transcriptomics.Digestion` and `UsefulProteomicsDatabases.Transcriptomics` in `TestDecoyGenerator.cs`. - Introduced `TestCreateNew` in `TestDecoyGenerator.cs` to verify RNA and oligo creation. - Added `using` directive for `MzLibUtil` in `TestDigestion.cs`. - Added a test in `TestDigestion.cs` for exception handling with invalid sequences. - Added `using` directives for `Omics` and related namespaces in `TestFragmentation.cs`. - Modified `TestFragmentation_Modified` in `TestFragmentation.cs` to use `OligoWithSetMods` directly and added assertions. - Updated `ClassExtensions.cs` to allow setting `isDecoy` in new `RNA` objects. - Refactored `OligoWithSetMods.cs` to return a dictionary from `GetModsAfterDeserialization`. 
- Updated `OligoWithSetMods.cs` to initialize `_allModsOneIsNterminus` using the returned dictionary. --- .../Transcriptomics/TestDecoyGenerator.cs | 38 +++++++++++++++++++ mzLib/Test/Transcriptomics/TestDigestion.cs | 6 +++ .../Test/Transcriptomics/TestFragmentation.cs | 16 ++++++-- mzLib/Transcriptomics/ClassExtensions.cs | 3 +- .../Digestion/OligoWithSetMods.cs | 11 ++++-- 5 files changed, 65 insertions(+), 9 deletions(-) diff --git a/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs b/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs index ea5b2c22d..800126b1a 100644 --- a/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs +++ b/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs @@ -8,6 +8,7 @@ using System.Threading.Tasks; using NUnit.Framework.Interfaces; using Transcriptomics; +using Transcriptomics.Digestion; using UsefulProteomicsDatabases.Transcriptomics; using UsefulProteomicsDatabases; @@ -223,5 +224,42 @@ public void TestSlideDecoy_FromDatabase() //Assert.That(errors.Count, Is.EqualTo(0)); //Assert.That(oligos.Count, Is.EqualTo(10)); } + + + [Test] + public void TestCreateNew() + { + var mods = PtmListLoader.ReadModsFromString( + "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + out List<(Modification, string)> modsOut).ToList(); + var modDict = mods.ToDictionary(p => p.IdWithMotif, p => p); + var oneBasedPossibleLocalizedModifications = new Dictionary>() + { + { 1, new List() { modDict["Sodium on A"] } }, + { 3, new List() { modDict["Sodium on A"] } }, + }; + + var rna = new RNA("GAACUG", "name", "accession", "organism", "databaseFilePath", + null, null, oneBasedPossibleLocalizedModifications, false, false, + new Dictionary()); + var oligos = rna + .Digest(new RnaDigestionParams(maxMods: 1), new List(), mods) + .ToList(); + + var clonedRna = rna.CreateNew(null, null, true); + var clonedOligo = oligos.First().CreateNew(null, null, true); + + // ensure they are identical except for the isDecoy field + // ensure they are identical 
except for the isDecoy field + Assert.That(rna.BaseSequence, Is.EqualTo(clonedRna.BaseSequence)); + Assert.That(rna.OneBasedPossibleLocalizedModifications, Is.EqualTo(clonedRna.OneBasedPossibleLocalizedModifications)); + Assert.That(rna.IsDecoy, Is.Not.EqualTo(clonedRna.IsDecoy)); + + Assert.That(oligos.First().BaseSequence, Is.EqualTo(clonedOligo.BaseSequence)); + Assert.That(oligos.First().OneBasedPossibleLocalizedModifications, Is.EqualTo(clonedOligo.OneBasedPossibleLocalizedModifications)); + Assert.That(oligos.First().Parent.IsDecoy, Is.Not.EqualTo(clonedOligo.Parent.IsDecoy)); + + + } } } diff --git a/mzLib/Test/Transcriptomics/TestDigestion.cs b/mzLib/Test/Transcriptomics/TestDigestion.cs index 0a9c526eb..37abc0447 100644 --- a/mzLib/Test/Transcriptomics/TestDigestion.cs +++ b/mzLib/Test/Transcriptomics/TestDigestion.cs @@ -5,6 +5,7 @@ using System.Linq; using Chemistry; using MassSpectrometry; +using MzLibUtil; using NUnit.Framework; using Omics; using Omics.Digestion; @@ -533,6 +534,11 @@ public static void OligoWithSetMods_CalculatedValues() Assert.That(oligoWithSetMods.MonoisotopicMass, Is.EqualTo(oldMonoMass + deltaMass).Within(0.01)); Assert.That(oligoWithSetMods.MostAbundantMonoisotopicMass, Is.EqualTo(oldMostAbundantMass + deltaMass).Within(0.01)); Assert.That(oligoWithSetMods.ThisChemicalFormula, Is.EqualTo(formula + formulaToAdd + formulaToAdd)); + + Assert.Throws(() => + { + var oligo = new OligoWithSetMods("GUA|GAUGUC", new Dictionary()); + }); } #endregion diff --git a/mzLib/Test/Transcriptomics/TestFragmentation.cs b/mzLib/Test/Transcriptomics/TestFragmentation.cs index fea764246..76ddb8c3b 100644 --- a/mzLib/Test/Transcriptomics/TestFragmentation.cs +++ b/mzLib/Test/Transcriptomics/TestFragmentation.cs @@ -5,6 +5,7 @@ using System.Linq; using Transcriptomics; using MassSpectrometry; +using Omics; using Omics.Fragmentation; using Omics.Fragmentation.Oligo; using Omics.Modifications; @@ -130,18 +131,25 @@ public void 
TestFragmentation_Modified(string sequence, string modString, string ProductType productType, double[] unmodifiedFragmentMass, double[] modifiedFragmentMasses) { var mods = PtmListLoader.ReadModsFromString(modString, out List<(Modification, string)> modsOut).ToList(); + var modDict = mods.ToDictionary(p => p.IdWithMotif, p => p); var rna = new RNA(sequence); - var unmodifiedOligo = rna.Digest(new RnaDigestionParams(), new List(), new List()) - .First() as OligoWithSetMods ?? throw new NullReferenceException(); + var unmodifiedOligo = new OligoWithSetMods(sequence, new Dictionary(), + 0, new RnaDigestionParams(), rna, 1, rna.Length); Assert.That(unmodifiedOligo.AllModsOneIsNterminus.Count, Is.EqualTo(0)); Assert.That(unmodifiedOligo.FullSequence, Is.EqualTo(sequence)); + Assert.That(unmodifiedOligo.SequenceWithChemicalFormulas, Is.EqualTo(sequence)); + Assert.That(unmodifiedOligo.FullSequenceWithMassShift(), Is.EqualTo(sequence)); Assert.That(unmodifiedOligo.MonoisotopicMass, Is.EqualTo(unmodifiedMass).Within(0.01)); - var modifiedOligo = rna.Digest(new RnaDigestionParams(), mods, new List()) - .First() as OligoWithSetMods ?? 
throw new NullReferenceException(); + var modifiedOligo = new OligoWithSetMods(fullSequence, modDict, + 0, new RnaDigestionParams(), rna, 1, rna.Length); + var formulaSequence = fullSequence.Replace("Metal:Sodium on A", "H-1Na"); + var massShiftSequence = fullSequence.Replace("Metal:Sodium on A", "+21.981944"); Assert.That(modifiedOligo.AllModsOneIsNterminus.Count, Is.EqualTo(mods.Count)); Assert.That(modifiedOligo.FullSequence, Is.EqualTo(fullSequence)); + Assert.That(modifiedOligo.SequenceWithChemicalFormulas, Is.EqualTo(formulaSequence)); + Assert.That(modifiedOligo.FullSequenceWithMassShift(), Is.EqualTo(massShiftSequence)); Assert.That(modifiedOligo.MonoisotopicMass, Is.EqualTo(modifiedMass).Within(0.01)); var unmodifiedProducts = unmodifiedOligo.GetNeutralFragments(productType).ToList(); diff --git a/mzLib/Transcriptomics/ClassExtensions.cs b/mzLib/Transcriptomics/ClassExtensions.cs index 2ac37a67d..a5617863e 100644 --- a/mzLib/Transcriptomics/ClassExtensions.cs +++ b/mzLib/Transcriptomics/ClassExtensions.cs @@ -45,6 +45,7 @@ public static T CreateNew(this T target, string? sequence = null, IDictionary case OligoWithSetMods oligo: { var oldParent = oligo.Parent as RNA ?? throw new NullReferenceException(); + bool newIsDecoy = isDecoy ?? oldParent.IsDecoy; var newParent = new RNA( newSequence, oldParent.Name, @@ -55,7 +56,7 @@ public static T CreateNew(this T target, string? 
sequence = null, IDictionary oldParent.ThreePrimeTerminus, newModifications, oldParent.IsContaminant, - oldParent.IsDecoy, + newIsDecoy, oldParent.AdditionalDatabaseFields); returnObj = new OligoWithSetMods( diff --git a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs index 2ef0ae3f2..f432b5fed 100644 --- a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs +++ b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs @@ -49,9 +49,10 @@ public OligoWithSetMods(string sequence, Dictionary allKno FullSequence = sequence; _baseSequence = IBioPolymerWithSetMods.GetBaseSequenceFromFullSequence(sequence); - GetModsAfterDeserialization(allKnownMods); + _allModsOneIsNterminus = GetModsAfterDeserialization(allKnownMods); NumFixedMods = numFixedMods; _digestionParams = digestionParams; + Description = description; if (n != null) Parent = n; @@ -303,9 +304,9 @@ public IBioPolymerWithSetMods Localize(int j, double massToLocalize) return peptideWithLocalizedMass; } - private void GetModsAfterDeserialization(Dictionary idToMod) + private Dictionary GetModsAfterDeserialization(Dictionary idToMod) { - _allModsOneIsNterminus = new Dictionary(); + var mods = new Dictionary(); int currentModStart = 0; int currentModificationLocation = 1; bool currentlyReadingMod = false; @@ -355,7 +356,7 @@ private void GetModsAfterDeserialization(Dictionary idToMo currentModificationLocation = BaseSequence.Length + 2; } - _allModsOneIsNterminus.Add(currentModificationLocation, mod); + mods.Add(currentModificationLocation, mod); currentlyReadingMod = false; } } @@ -365,6 +366,8 @@ private void GetModsAfterDeserialization(Dictionary idToMo } //else do nothing } + + return mods; } } } From 5ec870db530937569d1beecbec0d78b6f03936be Mon Sep 17 00:00:00 2001 From: nbollis Date: Wed, 25 Sep 2024 12:19:36 -0500 Subject: [PATCH 11/17] Broke out TerminusSpecificProductTypes class and removed unnecessary namespaces --- 
.../Oligo/DissociationTypeCollection.cs | 109 +------------- .../Oligo/TerminusSpecificProductTypes.cs | 141 ++++++++++++++++++ mzLib/Test/Transcriptomics/TestProductType.cs | 2 +- mzLib/Transcriptomics/ClassExtensions.cs | 4 - .../Interfaces/INucleicAcid.cs | 1 - mzLib/Transcriptomics/NucleicAcid.cs | 4 - mzLib/Transcriptomics/RNA.cs | 5 - 7 files changed, 148 insertions(+), 118 deletions(-) create mode 100644 mzLib/Omics/Fragmentation/Oligo/TerminusSpecificProductTypes.cs diff --git a/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs b/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs index 7b5a411ee..4302fadcb 100644 --- a/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs +++ b/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs @@ -100,6 +100,12 @@ public static class DissociationTypeCollection public static List GetRnaProductTypesFromDissociationType(this DissociationType dissociationType) => ProductsFromDissociationType[dissociationType]; + /// + /// Returns mass shift by product type + /// + /// + /// + public static double GetRnaMassShiftFromProductType(this ProductType type) => FragmentIonCaps[type].MonoisotopicMass; /// /// Mass to be added or subtracted @@ -138,109 +144,6 @@ public static List GetRnaProductTypesFromDissociationType(this Diss { ProductType.M, new ChemicalFormula() } }; - /// - /// Returns mass shift by product type - /// - /// - /// - public static double GetRnaMassShiftFromProductType(this ProductType type) => FragmentIonCaps[type].MonoisotopicMass; - - public static FragmentationTerminus GetRnaTerminusType(this ProductType fragmentType) - { - switch (fragmentType) - { - case ProductType.a: - case ProductType.aWaterLoss: - case ProductType.aBaseLoss: - case ProductType.b: - case ProductType.bWaterLoss: - case ProductType.bBaseLoss: - case ProductType.c: - case ProductType.cWaterLoss: - case ProductType.cBaseLoss: - case ProductType.d: - case ProductType.dWaterLoss: - case 
ProductType.dBaseLoss: - return FragmentationTerminus.FivePrime; - - case ProductType.w: - case ProductType.wWaterLoss: - case ProductType.wBaseLoss: - case ProductType.x: - case ProductType.xWaterLoss: - case ProductType.xBaseLoss: - case ProductType.y: - case ProductType.yWaterLoss: - case ProductType.yBaseLoss: - case ProductType.z: - case ProductType.zWaterLoss: - case ProductType.zBaseLoss: - return FragmentationTerminus.ThreePrime; - - case ProductType.M: - return FragmentationTerminus.None; - - case ProductType.aStar: - case ProductType.aDegree: - case ProductType.bAmmoniaLoss: - case ProductType.yAmmoniaLoss: - case ProductType.zPlusOne: - case ProductType.D: - case ProductType.Ycore: - case ProductType.Y: - default: - throw new ArgumentOutOfRangeException(nameof(fragmentType), fragmentType, null); - } - } - - /// - /// Product ion types by Fragmentation Terminus - /// - private static readonly Dictionary> - ProductIonTypesFromSpecifiedTerminus = new Dictionary> - { - { - FragmentationTerminus.FivePrime, new List - { - ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, - ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, - ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, - ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, - } - }, - { - FragmentationTerminus.ThreePrime, new List - { - ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, - ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, - ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, - ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, - } - }, - { - FragmentationTerminus.Both, new List - { - - ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, - ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, - ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, - ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, - ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, - 
ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, - ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, - ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, - ProductType.M - } - } - }; - - - public static List GetRnaTerminusSpecificProductTypes( - this FragmentationTerminus fragmentationTerminus) - { - return ProductIonTypesFromSpecifiedTerminus[fragmentationTerminus]; - } - /// /// Returns all product ion types based upon specified terminus /// diff --git a/mzLib/Omics/Fragmentation/Oligo/TerminusSpecificProductTypes.cs b/mzLib/Omics/Fragmentation/Oligo/TerminusSpecificProductTypes.cs new file mode 100644 index 000000000..0ec5541cd --- /dev/null +++ b/mzLib/Omics/Fragmentation/Oligo/TerminusSpecificProductTypes.cs @@ -0,0 +1,141 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Omics.Fragmentation.Oligo +{ + public static class TerminusSpecificProductTypes + { + public static List GetRnaTerminusSpecificProductTypes( + this FragmentationTerminus fragmentationTerminus) + { + return ProductIonTypesFromSpecifiedTerminus[fragmentationTerminus]; + } + + /// + /// The types of ions that can be generated from an oligo fragment, based on the terminus of the fragment + /// + public static Dictionary> ProductIonTypesFromSpecifiedTerminus = new Dictionary> + { + { + FragmentationTerminus.FivePrime, new List + { + ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, + ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, + ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, + ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, + } + }, + { + FragmentationTerminus.ThreePrime, new List + { + ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, + ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, + ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, + ProductType.z, ProductType.zWaterLoss, 
ProductType.zBaseLoss, + } + }, + { + FragmentationTerminus.Both, new List + { + + ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, + ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, + ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, + ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, + ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, + ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, + ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, + ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, + ProductType.M + } + + }, + { + FragmentationTerminus.None, new List() + } + }; + + + public static FragmentationTerminus GetRnaTerminusType(this ProductType fragmentType) + { + switch (fragmentType) + { + case ProductType.a: + case ProductType.aWaterLoss: + case ProductType.aBaseLoss: + case ProductType.b: + case ProductType.bWaterLoss: + case ProductType.bBaseLoss: + case ProductType.c: + case ProductType.cWaterLoss: + case ProductType.cBaseLoss: + case ProductType.d: + case ProductType.dWaterLoss: + case ProductType.dBaseLoss: + case ProductType.w: + case ProductType.wWaterLoss: + case ProductType.wBaseLoss: + case ProductType.x: + case ProductType.xWaterLoss: + case ProductType.xBaseLoss: + case ProductType.y: + case ProductType.yWaterLoss: + case ProductType.yBaseLoss: + case ProductType.z: + case ProductType.zWaterLoss: + case ProductType.zBaseLoss: + case ProductType.M: + return ProductTypeToFragmentationTerminus[fragmentType]; + + case ProductType.aStar: + case ProductType.aDegree: + case ProductType.bAmmoniaLoss: + case ProductType.yAmmoniaLoss: + case ProductType.zPlusOne: + case ProductType.D: + case ProductType.Ycore: + case ProductType.Y: + default: + throw new ArgumentOutOfRangeException(nameof(fragmentType), fragmentType, null); + } + } + + + /// + /// The terminus of the oligo fragment that the product ion is generated from + /// + public static Dictionary 
ProductTypeToFragmentationTerminus = new Dictionary + { + { ProductType.a, FragmentationTerminus.FivePrime }, + { ProductType.aWaterLoss, FragmentationTerminus.FivePrime }, + { ProductType.aBaseLoss, FragmentationTerminus.FivePrime }, + { ProductType.b, FragmentationTerminus.FivePrime }, + { ProductType.bWaterLoss, FragmentationTerminus.FivePrime }, + { ProductType.bBaseLoss, FragmentationTerminus.FivePrime }, + { ProductType.c, FragmentationTerminus.FivePrime }, + { ProductType.cWaterLoss, FragmentationTerminus.FivePrime }, + { ProductType.cBaseLoss, FragmentationTerminus.FivePrime }, + { ProductType.d, FragmentationTerminus.FivePrime }, + { ProductType.dWaterLoss, FragmentationTerminus.FivePrime }, + { ProductType.dBaseLoss, FragmentationTerminus.FivePrime }, + + { ProductType.w, FragmentationTerminus.ThreePrime }, + { ProductType.wWaterLoss, FragmentationTerminus.ThreePrime }, + { ProductType.wBaseLoss, FragmentationTerminus.ThreePrime }, + { ProductType.x, FragmentationTerminus.ThreePrime }, + { ProductType.xWaterLoss, FragmentationTerminus.ThreePrime }, + { ProductType.xBaseLoss, FragmentationTerminus.ThreePrime }, + { ProductType.y, FragmentationTerminus.ThreePrime }, + { ProductType.yWaterLoss, FragmentationTerminus.ThreePrime }, + { ProductType.yBaseLoss, FragmentationTerminus.ThreePrime }, + { ProductType.z, FragmentationTerminus.ThreePrime }, + { ProductType.zWaterLoss, FragmentationTerminus.ThreePrime }, + { ProductType.zBaseLoss, FragmentationTerminus.ThreePrime }, + + { ProductType.M, FragmentationTerminus.Both } + }; + } +} diff --git a/mzLib/Test/Transcriptomics/TestProductType.cs b/mzLib/Test/Transcriptomics/TestProductType.cs index f9c459211..15757f4d2 100644 --- a/mzLib/Test/Transcriptomics/TestProductType.cs +++ b/mzLib/Test/Transcriptomics/TestProductType.cs @@ -228,7 +228,7 @@ public void TestProductTypes_GetRnaTerminusType() break; case ProductType.M: - Assert.That(type.GetRnaTerminusType(), Is.EqualTo(FragmentationTerminus.None)); + 
Assert.That(type.GetRnaTerminusType(), Is.EqualTo(FragmentationTerminus.Both)); break; case ProductType.aStar: diff --git a/mzLib/Transcriptomics/ClassExtensions.cs b/mzLib/Transcriptomics/ClassExtensions.cs index a5617863e..4a6a52962 100644 --- a/mzLib/Transcriptomics/ClassExtensions.cs +++ b/mzLib/Transcriptomics/ClassExtensions.cs @@ -1,9 +1,5 @@ using Omics.Modifications; -using System; -using System.Collections.Generic; -using System.Linq; using System.Text; -using System.Threading.Tasks; using Transcriptomics.Digestion; namespace Transcriptomics diff --git a/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs b/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs index d2052aee3..4e3e95e4d 100644 --- a/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs +++ b/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs @@ -1,5 +1,4 @@ using Chemistry; -using Omics; using Omics.Modifications; namespace Transcriptomics diff --git a/mzLib/Transcriptomics/NucleicAcid.cs b/mzLib/Transcriptomics/NucleicAcid.cs index db6f18f43..c99934f75 100644 --- a/mzLib/Transcriptomics/NucleicAcid.cs +++ b/mzLib/Transcriptomics/NucleicAcid.cs @@ -2,11 +2,7 @@ using Omics.Digestion; using Omics.Modifications; using Omics; -using System; -using System.Collections.Generic; -using System.Linq; using System.Text; -using System.Threading.Tasks; using Transcriptomics.Digestion; namespace Transcriptomics diff --git a/mzLib/Transcriptomics/RNA.cs b/mzLib/Transcriptomics/RNA.cs index 3e72c1f14..af6798cea 100644 --- a/mzLib/Transcriptomics/RNA.cs +++ b/mzLib/Transcriptomics/RNA.cs @@ -1,10 +1,5 @@ using Chemistry; using Omics.Modifications; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; namespace Transcriptomics { From bafa5179f4de022f64d380674d1035985d0adefa Mon Sep 17 00:00:00 2001 From: Nic Bollis Date: Fri, 27 Sep 2024 09:51:04 -0500 Subject: [PATCH 12/17] Update ProteinXmlEntry.cs --- mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs 
| 13 ------------- 1 file changed, 13 deletions(-) diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs index 698a1c51c..3e9bb5a34 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs @@ -406,19 +406,6 @@ private static void ParseAnnotatedMods(Dictionary> desti } } - private static ModificationMotif GetMotif(string proteinSequence, int position) - { - string aminoAcid = proteinSequence.Substring(position - 1, 1); - if (ModificationMotif.TryGetMotif(aminoAcid, out ModificationMotif motif)) - { - return motif; - } - else - { - return null; - } - } - /// /// Finish parsing a database reference element /// From 0bbad42065826bd239794a9d6580d5a9e7dc51a8 Mon Sep 17 00:00:00 2001 From: Nic Bollis Date: Fri, 27 Sep 2024 10:19:52 -0500 Subject: [PATCH 13/17] Added gene name to RNA constructor --- mzLib/Transcriptomics/NucleicAcid.cs | 3 ++- mzLib/Transcriptomics/RNA.cs | 4 ++-- mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mzLib/Transcriptomics/NucleicAcid.cs b/mzLib/Transcriptomics/NucleicAcid.cs index c99934f75..2cc9de6c0 100644 --- a/mzLib/Transcriptomics/NucleicAcid.cs +++ b/mzLib/Transcriptomics/NucleicAcid.cs @@ -60,7 +60,7 @@ protected NucleicAcid(string sequence, IHasChemicalFormula? fivePrimeTerm = null protected NucleicAcid(string sequence, string name, string identifier, string organism, string databaseFilePath, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, IDictionary>? oneBasedPossibleLocalizedModifications = null, - bool isContaminant = false, bool isDecoy = false, + bool isContaminant = false, bool isDecoy = false, List>? geneNames = null, Dictionary? 
additionalDatabaseFields = null) : this(sequence, fivePrimeTerm, threePrimeTerm, oneBasedPossibleLocalizedModifications) { @@ -71,6 +71,7 @@ protected NucleicAcid(string sequence, string name, string identifier, string or Organism = organism; Accession = identifier; AdditionalDatabaseFields = additionalDatabaseFields; + GeneNames = geneNames ?? new List>(); } #endregion diff --git a/mzLib/Transcriptomics/RNA.cs b/mzLib/Transcriptomics/RNA.cs index af6798cea..5d5fcb2f6 100644 --- a/mzLib/Transcriptomics/RNA.cs +++ b/mzLib/Transcriptomics/RNA.cs @@ -35,10 +35,10 @@ public RNA(string sequence, IHasChemicalFormula? fivePrimeTerm = null, IHasChemi public RNA(string sequence, string name, string identifier, string organism, string databaseFilePath, IHasChemicalFormula? fivePrimeTerminus = null, IHasChemicalFormula? threePrimeTerminus = null, IDictionary>? oneBasedPossibleModifications = null, - bool isContaminant = false, bool isDecoy = false, + bool isContaminant = false, bool isDecoy = false, List> geneNames = null, Dictionary? 
databaseAdditionalFields = null) : base(sequence, name, identifier, organism, databaseFilePath, fivePrimeTerminus, threePrimeTerminus, - oneBasedPossibleModifications, isContaminant, isDecoy, databaseAdditionalFields) + oneBasedPossibleModifications, isContaminant, isDecoy, geneNames, databaseAdditionalFields) { } diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs index 3e9bb5a34..becfa2cfa 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs @@ -248,7 +248,7 @@ internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string r ParseAnnotatedMods(OneBasedModifications, modTypesToExclude, unknownModifications, AnnotatedMods); result = new RNA(Sequence, Name, Accession, Organism, rnaDbLocation, null, - null, OneBasedModifications, isContaminant, false, null); + null, OneBasedModifications, isContaminant, false, GeneNames, null); } Clear(); return result; From 5e852a1ea492f73702768820036f102168a4f603 Mon Sep 17 00:00:00 2001 From: Nic Bollis Date: Fri, 27 Sep 2024 10:19:52 -0500 Subject: [PATCH 14/17] Added gene name to RNA constructor --- mzLib/Test/Transcriptomics/TestDecoyGenerator.cs | 2 +- mzLib/Transcriptomics/ClassExtensions.cs | 3 ++- mzLib/Transcriptomics/NucleicAcid.cs | 3 ++- mzLib/Transcriptomics/RNA.cs | 4 ++-- mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs | 2 +- .../UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs | 2 +- 6 files changed, 9 insertions(+), 7 deletions(-) diff --git a/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs b/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs index 800126b1a..1b81a0d5a 100644 --- a/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs +++ b/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs @@ -240,7 +240,7 @@ public void TestCreateNew() }; var rna = new RNA("GAACUG", "name", "accession", "organism", "databaseFilePath", - null, null, 
oneBasedPossibleLocalizedModifications, false, false, + null, null, oneBasedPossibleLocalizedModifications, false, false, new List>(), new Dictionary()); var oligos = rna .Digest(new RnaDigestionParams(maxMods: 1), new List(), mods) diff --git a/mzLib/Transcriptomics/ClassExtensions.cs b/mzLib/Transcriptomics/ClassExtensions.cs index 4a6a52962..ef56c737d 100644 --- a/mzLib/Transcriptomics/ClassExtensions.cs +++ b/mzLib/Transcriptomics/ClassExtensions.cs @@ -35,7 +35,7 @@ public static T CreateNew(this T target, string? sequence = null, IDictionary { bool newIsDecoy = isDecoy ?? rna.IsDecoy; returnObj = new RNA(newSequence, rna.Name, rna.Accession, rna.Organism, rna.DatabaseFilePath, - rna.FivePrimeTerminus, rna.ThreePrimeTerminus, newModifications, rna.IsContaminant, newIsDecoy, rna.AdditionalDatabaseFields); + rna.FivePrimeTerminus, rna.ThreePrimeTerminus, newModifications, rna.IsContaminant, newIsDecoy, rna.GeneNames.ToList(), rna.AdditionalDatabaseFields); break; } case OligoWithSetMods oligo: @@ -53,6 +53,7 @@ public static T CreateNew(this T target, string? sequence = null, IDictionary newModifications, oldParent.IsContaminant, newIsDecoy, + oldParent.GeneNames.ToList(), oldParent.AdditionalDatabaseFields); returnObj = new OligoWithSetMods( diff --git a/mzLib/Transcriptomics/NucleicAcid.cs b/mzLib/Transcriptomics/NucleicAcid.cs index c99934f75..2cc9de6c0 100644 --- a/mzLib/Transcriptomics/NucleicAcid.cs +++ b/mzLib/Transcriptomics/NucleicAcid.cs @@ -60,7 +60,7 @@ protected NucleicAcid(string sequence, IHasChemicalFormula? fivePrimeTerm = null protected NucleicAcid(string sequence, string name, string identifier, string organism, string databaseFilePath, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, IDictionary>? oneBasedPossibleLocalizedModifications = null, - bool isContaminant = false, bool isDecoy = false, + bool isContaminant = false, bool isDecoy = false, List>? geneNames = null, Dictionary? 
additionalDatabaseFields = null) : this(sequence, fivePrimeTerm, threePrimeTerm, oneBasedPossibleLocalizedModifications) { @@ -71,6 +71,7 @@ protected NucleicAcid(string sequence, string name, string identifier, string or Organism = organism; Accession = identifier; AdditionalDatabaseFields = additionalDatabaseFields; + GeneNames = geneNames ?? new List>(); } #endregion diff --git a/mzLib/Transcriptomics/RNA.cs b/mzLib/Transcriptomics/RNA.cs index af6798cea..5d5fcb2f6 100644 --- a/mzLib/Transcriptomics/RNA.cs +++ b/mzLib/Transcriptomics/RNA.cs @@ -35,10 +35,10 @@ public RNA(string sequence, IHasChemicalFormula? fivePrimeTerm = null, IHasChemi public RNA(string sequence, string name, string identifier, string organism, string databaseFilePath, IHasChemicalFormula? fivePrimeTerminus = null, IHasChemicalFormula? threePrimeTerminus = null, IDictionary>? oneBasedPossibleModifications = null, - bool isContaminant = false, bool isDecoy = false, + bool isContaminant = false, bool isDecoy = false, List> geneNames = null, Dictionary? 
databaseAdditionalFields = null) : base(sequence, name, identifier, organism, databaseFilePath, fivePrimeTerminus, threePrimeTerminus, - oneBasedPossibleModifications, isContaminant, isDecoy, databaseAdditionalFields) + oneBasedPossibleModifications, isContaminant, isDecoy, geneNames, databaseAdditionalFields) { } diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs index 3e9bb5a34..becfa2cfa 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs @@ -248,7 +248,7 @@ internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string r ParseAnnotatedMods(OneBasedModifications, modTypesToExclude, unknownModifications, AnnotatedMods); result = new RNA(Sequence, Name, Accession, Organism, rnaDbLocation, null, - null, OneBasedModifications, isContaminant, false, null); + null, OneBasedModifications, isContaminant, false, GeneNames, null); } Clear(); return result; diff --git a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs index 13aa82fb2..2e80c090c 100644 --- a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs +++ b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs @@ -144,7 +144,7 @@ public static List LoadRnaFasta(string rnaDbLocation, bool generateTargets, RNA rna = new RNA(sequence, name, identifier, organism, rnaDbLocation, fivePrimeTerm, threePrimeTerm, null, - isContaminant, false, additonalDatabaseFields); + isContaminant, false, null, additonalDatabaseFields); if (rna.Length == 0) errors.Add("Line" + line + ", Rna length of 0: " + rna.Name + "was skipped from database: " + rnaDbLocation); else From 501ef559ac683764732db7be5ee9616c209fc20d Mon Sep 17 00:00:00 2001 From: nbollis Date: Tue, 1 Oct 2024 18:34:33 -0500 Subject: [PATCH 15/17] Refactor and enhance exception handling and tests Refactored constructors, 
improved exception handling, and added comprehensive tests across multiple files. Key changes include: - `MzLibException.cs`: Updated constructor to include `innerException`. - `TestDecoyGenerator.cs`: Added assertions for `CreateNew` method. - `TestDigestion.cs`: Added assertions and new test for RNA digestion exception. - Refactored modification lists and added various tests for modifications. - `TestNucleicAcid.cs`: Refactored methods, adjusted precision, and updated terminus assignments. - `NucleolyticOligo.cs`: Changed parameter types, updated comments, and improved variable names. - `OligoWithSetMods.cs`: Enhanced exception messages and updated modification location checks. - `NucleicAcid.cs`: Added `using` directive, changed exception type, and refactored methods. - `mzLib.sln.DotSettings`: Updated user dictionary entries. --- mzLib/MzLibUtil/MzLibException.cs | 9 +- .../Transcriptomics/TestDecoyGenerator.cs | 17 +- mzLib/Test/Transcriptomics/TestDigestion.cs | 418 ++++++++++++++++-- mzLib/Test/Transcriptomics/TestNucleicAcid.cs | 21 +- .../Digestion/NucleolyticOligo.cs | 26 +- .../Digestion/OligoWithSetMods.cs | 4 +- mzLib/Transcriptomics/NucleicAcid.cs | 14 +- mzLib/mzLib.sln.DotSettings | 1 + 8 files changed, 443 insertions(+), 67 deletions(-) diff --git a/mzLib/MzLibUtil/MzLibException.cs b/mzLib/MzLibUtil/MzLibException.cs index cf86074d8..61ecc8d6b 100644 --- a/mzLib/MzLibUtil/MzLibException.cs +++ b/mzLib/MzLibUtil/MzLibException.cs @@ -3,11 +3,6 @@ namespace MzLibUtil { [Serializable] - public class MzLibException : Exception - { - public MzLibException(string message) - : base(message) - { - } - } + public class MzLibException(string message, Exception innerException = null) + : Exception(message, innerException); } \ No newline at end of file diff --git a/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs b/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs index 800126b1a..8a098e45f 100644 --- a/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs +++ 
b/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs @@ -249,7 +249,6 @@ public void TestCreateNew() var clonedRna = rna.CreateNew(null, null, true); var clonedOligo = oligos.First().CreateNew(null, null, true); - // ensure they are identical except for the isDecoy field // ensure they are identical except for the isDecoy field Assert.That(rna.BaseSequence, Is.EqualTo(clonedRna.BaseSequence)); Assert.That(rna.OneBasedPossibleLocalizedModifications, Is.EqualTo(clonedRna.OneBasedPossibleLocalizedModifications)); @@ -260,6 +259,22 @@ public void TestCreateNew() Assert.That(oligos.First().Parent.IsDecoy, Is.Not.EqualTo(clonedOligo.Parent.IsDecoy)); + var newMods = new Dictionary>() + { + { 1, new List() { modDict["Sodium on A"] } }, + { 2, new List() { modDict["Sodium on A"] } }, + { 3, new List() { modDict["Sodium on A"] } }, + }; + clonedRna = rna.CreateNew("AAAAAA", newMods, null); + clonedOligo = oligos.First().CreateNew("AAAAAA", newMods, null); + + Assert.That(rna.BaseSequence, Is.Not.EqualTo(clonedRna.BaseSequence)); + Assert.That(rna.OneBasedPossibleLocalizedModifications, Is.Not.EqualTo(clonedRna.OneBasedPossibleLocalizedModifications)); + Assert.That(rna.IsDecoy, Is.EqualTo(clonedRna.IsDecoy)); + + Assert.That(oligos.First().BaseSequence, Is.Not.EqualTo(clonedOligo.BaseSequence)); + Assert.That(oligos.First().OneBasedPossibleLocalizedModifications, Is.Not.EqualTo(clonedOligo.OneBasedPossibleLocalizedModifications)); + Assert.That(oligos.First().Parent.IsDecoy, Is.EqualTo(clonedOligo.Parent.IsDecoy)); } } } diff --git a/mzLib/Test/Transcriptomics/TestDigestion.cs b/mzLib/Test/Transcriptomics/TestDigestion.cs index 37abc0447..a46017c0e 100644 --- a/mzLib/Test/Transcriptomics/TestDigestion.cs +++ b/mzLib/Test/Transcriptomics/TestDigestion.cs @@ -512,6 +512,7 @@ public static void OligoWithSetMods_CalculatedValues() Assert.That(oligoWithSetMods.NumMods, Is.EqualTo(1)); Assert.That(oligoWithSetMods.NumFixedMods, Is.EqualTo(1)); 
Assert.That(oligoWithSetMods.NumVariableMods, Is.EqualTo(0)); + Assert.That(oligoWithSetMods.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); var formula = oligoWithSetMods.ThisChemicalFormula; Assert.That(formula, Is.EqualTo(rnaFormula + sodiumAdduct.ChemicalFormula)); @@ -580,6 +581,7 @@ public void TestDigestionParamsClone() Assert.That(digestionParams.MaxLength, Is.EqualTo(cloned.MaxLength)); Assert.That(digestionParams.MaxMods, Is.EqualTo(cloned.MaxMods)); Assert.That(digestionParams.FragmentationTerminus, Is.Not.EqualTo(cloned.FragmentationTerminus)); + Assert.That(digestionParams.SearchModeType, Is.EqualTo(CleavageSpecificity.Full)); Assert.That(cloned.FragmentationTerminus, Is.EqualTo(FragmentationTerminus.C)); // do not set new terminus, all values are retained @@ -590,6 +592,7 @@ public void TestDigestionParamsClone() Assert.That(digestionParams.MaxLength, Is.EqualTo(cloned.MaxLength)); Assert.That(digestionParams.MaxMods, Is.EqualTo(cloned.MaxMods)); Assert.That(digestionParams.FragmentationTerminus, Is.EqualTo(cloned.FragmentationTerminus)); + Assert.That(digestionParams.SearchModeType, Is.EqualTo(CleavageSpecificity.Full)); Assert.That(cloned.FragmentationTerminus, Is.EqualTo(FragmentationTerminus.Both)); } @@ -651,25 +654,53 @@ public void TestNucleicAcid_Digestion_WithoutMods_MonoMasses(RnaDigestionTestCas } } + [Test] + public static void TestNucleicAcid_Digestion_Exception() + { + IDigestionParams digestionParams = new Proteomics.ProteolyticDigestion.DigestionParams(); + var rna = new RNA("GUACUGGUACUG"); + + try + { + var result = rna.Digest(digestionParams, new List(), new List()); + } + catch (Exception e) + { + Assert.That(e, Is.TypeOf()); + Assert.That(e.InnerException, Is.TypeOf()); + } + } + #endregion #region Digestion with Modifications + public static List SodiumAdducts => + PtmListLoader.ReadModsFromString("ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A or C or G or U\r\nCF Na1H-1\r\n" + @"//", + out 
List<(Modification, string)> mods).ToList(); + + public static List PotassiumAdducts => + PtmListLoader.ReadModsFromString("ID Potassium\r\nMT Metal\r\nPP Anywhere.\r\nTG A or C or G or U\r\nCF K1H-1\r\n" + @"//", + out List<(Modification, string)> mods).ToList(); + + public static List TerminalSodiumAdducts => + PtmListLoader.ReadModsFromString("ID Sodium\r\nMT Metal\r\nPP 3'-terminal.\r\nTG A or C or G or U\r\nCF Na1H-1\r\n" + @"//", + out List<(Modification, string)> mods).ToList(); + + public static List TerminalPotassiumAdducts => + PtmListLoader.ReadModsFromString("ID Potassium\r\nMT Metal\r\nPP 5'-terminal.\r\nTG A or C or G or U\r\nCF K1H-1\r\n" + @"//", + out List<(Modification, string)> mods).ToList(); + [Test] public static void TestVariableModsCountCorrect() { - string modText = "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A or C or G or U\r\nCF Na1H-1\r\n" + @"//"; - var sodiumAdducts = PtmListLoader.ReadModsFromString(modText, out List<(Modification, string)> mods) - .ToList(); - Assert.That(sodiumAdducts.Count, Is.EqualTo(4)); - var rna = new RNA("GUACUG"); var rnaDigestionParams = new RnaDigestionParams() { MaxMods = 1, }; - var precursors = rna.Digest(rnaDigestionParams, new List(), sodiumAdducts) + var precursors = rna.Digest(rnaDigestionParams, new List(), SodiumAdducts) .ToList(); Assert.That(precursors.Count, Is.EqualTo(7)); var fullSequences = precursors.Select(p => p.FullSequence).ToList(); @@ -682,7 +713,7 @@ public static void TestVariableModsCountCorrect() Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); rnaDigestionParams.MaxMods = 2; - precursors = rna.Digest(rnaDigestionParams, new List(), sodiumAdducts) + precursors = rna.Digest(rnaDigestionParams, new List(), SodiumAdducts) .ToList(); Assert.That(precursors.Count, Is.EqualTo(22)); fullSequences = precursors.Select(p => p.FullSequence).ToList(); @@ -713,9 +744,7 @@ public static void TestVariableModsCountCorrect() [Test] public static void TestFixedModsCountCorrect() { 
- string modText = "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//"; - var sodiumAdduct = PtmListLoader.ReadModsFromString(modText, out List<(Modification, string)> mods) - .ToList(); + var sodiumAdduct = new List() { SodiumAdducts[0] }; var rna = new RNA("GUACUG"); var rnaDigestionParams = new RnaDigestionParams() @@ -729,9 +758,7 @@ public static void TestFixedModsCountCorrect() Assert.That(precursors.First().FullSequence, Is.EqualTo("GUA[Metal:Sodium on A]CUG")); Assert.That(precursors.First().MonoisotopicMass, Is.EqualTo(1896.26).Within(0.01)); - modText = "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG G\r\nCF Na1H-1\r\n" + @"//"; - sodiumAdduct = PtmListLoader.ReadModsFromString(modText, out mods) - .ToList(); + sodiumAdduct = new List() { SodiumAdducts[2] }; precursors = rna.Digest(rnaDigestionParams, sodiumAdduct, new List()) .ToList(); @@ -744,22 +771,12 @@ public static void TestFixedModsCountCorrect() [Test] public static void TestFixedAndVariableMods() { - string modText = "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A or C or G or U\r\nCF Na1H-1\r\n" + @"//"; - string modText2 = "ID Potassium\r\nMT Metal\r\nPP Anywhere.\r\nTG A or C or G or U\r\nCF K1H-1\r\n" + @"//"; - var sodiumAdducts = PtmListLoader.ReadModsFromString(modText, out List<(Modification, string)> mods) - .ToList(); - var potassiumAdducts = PtmListLoader.ReadModsFromString(modText2, out mods) - .ToList(); - - Assert.That(sodiumAdducts.Count, Is.EqualTo(4)); - Assert.That(potassiumAdducts.Count, Is.EqualTo(4)); - var rna = new RNA("GUACUG"); var rnaDigestionParams = new RnaDigestionParams(); rnaDigestionParams.MaxMods = 1; - var fixedMods = new List { potassiumAdducts[0] }; // A - var variableMods = new List { sodiumAdducts[1] }; // C + var fixedMods = new List { PotassiumAdducts[0] }; // A + var variableMods = new List { SodiumAdducts[1] }; // C var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) .ToList(); @@ -774,8 +791,8 @@ public static void 
TestFixedAndVariableMods() Assert.That(oneOfEach.NumVariableMods, Is.EqualTo(1)); Assert.That(oneOfEach.NumMods, Is.EqualTo(2)); - fixedMods = new List { potassiumAdducts[2] }; // G - variableMods = new List { sodiumAdducts[1] }; // C + fixedMods = new List { PotassiumAdducts[2] }; // G + variableMods = new List { SodiumAdducts[1] }; // C precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) .ToList(); fullSequences = precursors.Select(p => p.FullSequence).ToList(); @@ -784,8 +801,8 @@ public static void TestFixedAndVariableMods() Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UAC[Metal:Sodium on C]UG[Metal:Potassium on G]")); - fixedMods = new List { potassiumAdducts[2] }; // G - variableMods = new List { sodiumAdducts[1], sodiumAdducts[3] }; // C, U + fixedMods = new List { PotassiumAdducts[2] }; // G + variableMods = new List { SodiumAdducts[1], SodiumAdducts[3] }; // C, U precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) .ToList(); fullSequences = precursors.Select(p => p.FullSequence).ToList(); @@ -811,6 +828,349 @@ public static void TestFixedAndVariableMods() Assert.That(fullSequences.Contains("G[Metal:Potassium on G]U[Metal:Sodium on U]AC[Metal:Sodium on C]UG[Metal:Potassium on G]")); } + /// + /// Test when one fixed and one variable mod are used and share a localization + /// expect two results, one with the fixed, and one with the variable + /// + [Test] + public static void TestFixedAndVariableMods_LocalizationOverlap() + { + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams(); + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { PotassiumAdducts[1] }; // C + var variableMods = new List { SodiumAdducts[1] }; // C + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => 
p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(2)); + Assert.That(precursors.Any(p => p.NumFixedMods == 1)); + Assert.That(precursors.Any(p => p.NumVariableMods == 1)); + Assert.That(precursors.Any(p => p.NumFixedMods == 0)); + Assert.That(precursors.Any(p => p.NumVariableMods == 0)); + Assert.That(precursors.All(p => p.NumMods == 1)); + Assert.That(fullSequences.Contains("GUAC[Metal:Potassium on C]UG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + } + } + + /// + /// Test when two variable mods are used and share a localization + /// expect three results, one unmodified, and two singly modified + /// + [Test] + public static void TestVariableMods_LocalizationOverlap() + { + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams(); + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; // C + var variableMods = new List { PotassiumAdducts[1], SodiumAdducts[1] }; + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + // expect three results, one unmodified, and two singly modified + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(3)); + Assert.That(precursors.Any(p => p.NumFixedMods == 0)); + Assert.That(precursors.Any(p => p.NumVariableMods == 1)); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Potassium on C]UG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + } + } + + /// + /// Test when one modification is annotated in the database, out of bounds + /// expect a single unmodified result; the out-of-bounds modification is not applied + /// + [Test] + public static void TestDatabaseAnnotatedMods_OutOfBounds() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 23, new List() { PotassiumAdducts[1] } } + }; + var rna = new 
RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; // C + var variableMods = new List { }; // C + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(1)); + Assert.That(precursors.All(p => p.NumFixedMods == 0)); + Assert.That(precursors.All(p => p.NumVariableMods == 0)); + Assert.That(precursors.All(p => p.NumMods == 0)); + Assert.That(fullSequences.Contains("GUACUG")); + } + } + + /// + /// Test when one modification is annotated in the database + /// expect two results, one unmodified, and one singly modified + /// + [Test] + public static void TestDatabaseAnnotatedMods_SingleModification() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 4, new List() { PotassiumAdducts[1] } } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; // C + var variableMods = new List { }; + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(2)); + Assert.That(precursors[0].NumMods, Is.EqualTo(0)); + Assert.That(precursors[1].NumMods, Is.EqualTo(1)); + Assert.That(precursors[1].NumVariableMods, Is.EqualTo(1)); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Potassium on C]UG")); + } + } + + /// + /// Test when two modifications are annotated in the database at the same location + /// expect three results, one unmodified, and two singly modified + /// + [Test] + public static void 
TestDatabaseAnnotatedMods_LocalizationOverlap() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 4, new List() { PotassiumAdducts[1], SodiumAdducts[1] } } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; + var variableMods = new List { }; + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(3)); + Assert.That(precursors.Any(p => p.NumFixedMods == 0)); + Assert.That(precursors.Any(p => p.NumVariableMods == 1)); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Potassium on C]UG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + } + } + + /// + /// Test when two terminal modifications are annotated in the database + /// MaxMods 1: expect three results, one unmodified, and two singly modified + /// MaxMods 2: expect four results, one unmodified, and two singly modified, and one double modified + /// + [Test] + public static void TestDatabaseAnnotatedMods_TerminalMods() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 1, new List() { TerminalPotassiumAdducts[2]} }, + { 6, new List() { TerminalSodiumAdducts[2]} } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + // Test when two terminal modifications are annotated in the database + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; + var variableMods = new List { }; + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + // expect three results, one unmodified, and two singly modified + var 
fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(2 + i)); + Assert.That(precursors.Any(p => p.NumFixedMods == 0)); + Assert.That(precursors.Any(p => p.NumVariableMods == 1)); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + + if (rnaDigestionParams.MaxMods != 2) continue; + Assert.That(precursors.Any(p => p.NumVariableMods == 2)); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + } + } + + /// + /// Test when two terminal modifications are annotated in the database and one database mod on first residue + /// MaxMods 1: expect four results, one unmodified, and three singly modified + /// MaxMods 2: expect seven results, one unmodified, and three singly modified, and three double modified + /// MaxMods 3: expect eight results, one unmodified, and three singly modified, and three double modified, and one triply modified + /// + [Test] + public static void TestDatabaseAnnotatedMods_TerminalMods_WithFirstResidueDatabaseMod() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 1, new List() { TerminalPotassiumAdducts[2], PotassiumAdducts[2] } }, + { 6, new List() { TerminalSodiumAdducts[2]} } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + // Test when two terminal modifications are annotated in the database and one database mod on first residue + for (int i = 1; i < 4; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; + var variableMods = new List { }; + + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.All(p => p.NumFixedMods == 0)); + + switch 
(rnaDigestionParams.MaxMods) + { + case 1: + Assert.That(precursors.Count(), Is.EqualTo(4)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods == 1)); + break; + case 2: + Assert.That(precursors.Count(), Is.EqualTo(7)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods >= 1)); + break; + + case 3: + Assert.That(precursors.Count(), Is.EqualTo(8)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods >= 1)); + break; + } + + if (rnaDigestionParams.MaxMods >= 1) + { + + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + } + else if (rnaDigestionParams.MaxMods >= 2) + { + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG")); + } + else if (rnaDigestionParams.MaxMods >= 3) + { + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + } + } + } + + /// + /// Test when two terminal modifications are annotated in the database and one database mod on first residue + /// MaxMods 1: expect five results, one unmodified, and four singly modified + /// MaxMods 2: expect eleven results, one unmodified, and four singly modified, and six double modified + /// MaxMods 3: expect fifteen results, one unmodified, and four singly modified, and six double modified, and four triply modified + /// + [Test] + public static void TestDatabaseAnnotatedMods_TerminalMods_WithFirstResidueVariableMod() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 1, new List() { TerminalPotassiumAdducts[2] } }, + { 6, new List() { 
TerminalSodiumAdducts[2]} } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + // Test when two terminal modifications are annotated in the database and one database mod on first residue + for (int i = 1; i < 4; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; + var variableMods = new List { PotassiumAdducts[2] }; + + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.All(p => p.NumFixedMods == 0)); + + switch (rnaDigestionParams.MaxMods) + { + case 1: + Assert.That(precursors.Count(), Is.EqualTo(5)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods == 1)); + break; + case 2: + Assert.That(precursors.Count(), Is.EqualTo(11)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods >= 1)); + break; + + case 3: + Assert.That(precursors.Count(), Is.EqualTo(15)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods >= 1)); + break; + } + + if (rnaDigestionParams.MaxMods >= 1) + { + + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); + } + else if (rnaDigestionParams.MaxMods >= 2) + { + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium on G][Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG")); + 
Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G]")); + } + else if (rnaDigestionParams.MaxMods >= 3) + { + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G][Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G][Metal:Sodium on G]")); + } + } + } + + + + #endregion } } diff --git a/mzLib/Test/Transcriptomics/TestNucleicAcid.cs b/mzLib/Test/Transcriptomics/TestNucleicAcid.cs index a0c5619c9..47e98d708 100644 --- a/mzLib/Test/Transcriptomics/TestNucleicAcid.cs +++ b/mzLib/Test/Transcriptomics/TestNucleicAcid.cs @@ -93,6 +93,9 @@ public void TestConstructorsAndEquality(string sequence, double monoMass) CollectionAssert.AreEqual(rna.NucleicAcidArray.Select(p => p.Letter), sequence); Assert.That(rna.FivePrimeTerminus.Equals(NucleicAcid.DefaultFivePrimeTerminus)); Assert.That(rna.ThreePrimeTerminus.Equals(NucleicAcid.DefaultThreePrimeTerminus)); + rna.ThreePrimeTerminus = rna.ThreePrimeTerminus; + Assert.That(rna.ThreePrimeTerminus.Equals(NucleicAcid.DefaultThreePrimeTerminus)); + List nucList = new(); foreach (var nucleotide in sequence) { @@ -145,11 +148,11 @@ public void TestElectroSpraySeries(string sequence, int[] charges, double[] mzs) { RNA rna = new(sequence); - int i = 0; - foreach (var ion in rna.GetElectrospraySeries(charges.First(), charges.Last())) + var esiSeries = rna.GetElectrospraySeries(charges.First(), charges.Last()).ToArray(); + for (int j = 0; j < mzs.Length; j++) { - Assert.That(ion, Is.EqualTo(mzs[i]).Within(0.001)); - i++; + var ion = esiSeries[j]; + Assert.That(ion, Is.EqualTo(mzs[j]).Within(0.01)); } } @@ -158,13 +161,13 @@ public void TestElectroSpraySeries(string sequence, int[] charges, 
double[] mzs) public void TestReplaceTerminusWithElectroSpraySeries(string sequence, int[] charges, double[] mzs) { RNA rna = new("GUACUG"); - rna.FivePrimeTerminus = new ChemicalFormula(); + rna.FivePrimeTerminus = ChemicalFormula.ParseFormula("H1"); - int i = 0; - foreach (var ion in rna.GetElectrospraySeries(charges.First(), charges.Last())) + var esiSeries = rna.GetElectrospraySeries(charges.Last(), charges.First()).ToArray(); + for (int j = 0; j < mzs.Length; j++) { - Assert.That(ion, Is.EqualTo(mzs[i]).Within(0.001)); - i++; + var ion = esiSeries[j]; + Assert.That(ion, Is.EqualTo(mzs[j]).Within(0.01)); } } } diff --git a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs index a2ad1741b..d2d41cba7 100644 --- a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs +++ b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs @@ -46,7 +46,7 @@ public override string ToString() /// /// Code heavily borrowed from ProteolyticPeptide.GetModifiedPeptides /// - internal IEnumerable GenerateModifiedOligos(IEnumerable allKnownFixedMods, + internal IEnumerable GenerateModifiedOligos(List allKnownFixedMods, RnaDigestionParams digestionParams, List variableModifications) { int oligoLength = OneBasedEndResidue - OneBasedStartResidue + 1; @@ -63,7 +63,7 @@ internal IEnumerable GenerateModifiedOligos(IEnumerable GenerateModifiedOligos(IEnumerable GenerateModifiedOligos(IEnumerable GenerateModifiedOligos(IEnumerable GenerateModifiedOligos(IEnumerable kvp in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForOligo, oligoLength)) + foreach (Dictionary variableModPattern in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForOligo, oligoLength)) { int numFixedMods = 0; - foreach (var ok in GetFixedModsOneIsNorFivePrimeTerminus(oligoLength, allKnownFixedMods)) + foreach (var fixedModPattern in GetFixedModsOneIsNorFivePrimeTerminus(oligoLength, 
allKnownFixedMods)) { - if (!kvp.ContainsKey(ok.Key)) + if (!variableModPattern.ContainsKey(fixedModPattern.Key)) { numFixedMods++; - kvp.Add(ok.Key, ok.Value); + variableModPattern.Add(fixedModPattern.Key, fixedModPattern.Value); } } yield return new OligoWithSetMods(NucleicAcid, digestionParams, OneBasedStartResidue, OneBasedEndResidue, MissedCleavages, - CleavageSpecificityForFdrCategory, kvp, numFixedMods, _fivePrimeTerminus, _threePrimeTerminus); - variable_modification_isoforms++; - if (variable_modification_isoforms == maximumVariableModificationIsoforms) + CleavageSpecificityForFdrCategory, variableModPattern, numFixedMods, _fivePrimeTerminus, _threePrimeTerminus); + variableModificationIsoforms++; + if (variableModificationIsoforms == maximumVariableModificationIsoforms) { yield break; } diff --git a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs index f432b5fed..5fab7d6bd 100644 --- a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs +++ b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs @@ -342,7 +342,7 @@ private Dictionary GetModsAfterDeserialization(Dictionary GetModsAfterDeserialization(Dictionary Digest(IDigestionParams digestionPara bool topDownTruncationSearch = false) { if (digestionParameters is not RnaDigestionParams digestionParams) - throw new ArgumentException( - "DigestionParameters must be of type DigestionParams for protein digestion"); + throw new MzLibException( + "DigestionParameters must be of type DigestionParams for protein digestion", new ArgumentException()); allKnownFixedMods ??= new(); variableModifications ??= new(); @@ -220,10 +221,11 @@ public IEnumerable Digest(RnaDigestionParams digestionParamete public IEnumerable GetElectrospraySeries(int minCharge, int maxCharge) { - for (int i = minCharge; i < maxCharge; i++) - { - yield return this.ToMz(i); - } + if (minCharge > maxCharge) + (minCharge, maxCharge) = (maxCharge, minCharge); + + for (int i = maxCharge; i > 
minCharge - 1; i--) + yield return this.ToMz(i); } #endregion diff --git a/mzLib/mzLib.sln.DotSettings b/mzLib/mzLib.sln.DotSettings index 6522afcd5..6c67babc8 100644 --- a/mzLib/mzLib.sln.DotSettings +++ b/mzLib/mzLib.sln.DotSettings @@ -1,4 +1,5 @@  + True True True True From b295b63324e02eaffb2cb3270715001194f750ff Mon Sep 17 00:00:00 2001 From: nbollis Date: Tue, 1 Oct 2024 19:01:44 -0500 Subject: [PATCH 16/17] Add test data files and methods for RNA sequence handling Added new test data files (`20mer1.fasta`, `20mer1.fasta.gz`, `20mer1.xml`, `20mer1.xml.gz`) to the `Transcriptomics\TestData` directory in the `Test.csproj` file, ensuring they are copied to the output directory. Introduced `TestDbReadingDifferentExtensions` in `TestDbLoader.cs` to verify RNA database reading from various formats. Added `TestDigestionMaxIsoforms` in `TestDigestion.cs` to test RNA sequence digestion with max isoforms. Updated `WriteNucleicAcidXmlDatabase` in `ProteinDbWriter.cs` with remarks for future implementation. Added a TODO in `RnaDecoyGenerator.cs` regarding palindromic sequences' impact on fragment ions. Included new RNA sequence data in test files for validation. 
--- mzLib/Test/Test.csproj | 12 ++++++++++ .../Transcriptomics/TestData/20mer1.fasta | 2 ++ .../Transcriptomics/TestData/20mer1.fasta.gz | Bin 0 -> 135 bytes .../Test/Transcriptomics/TestData/20mer1.xml | 17 ++++++++++++++ .../Transcriptomics/TestData/20mer1.xml.gz | Bin 0 -> 254 bytes mzLib/Test/Transcriptomics/TestDbLoader.cs | 22 ++++++++++++++++++ mzLib/Test/Transcriptomics/TestDigestion.cs | 12 +++++++++- .../ProteinDbWriter.cs | 4 ++++ .../Transcriptomics/RnaDecoyGenerator.cs | 1 + 9 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 mzLib/Test/Transcriptomics/TestData/20mer1.fasta create mode 100644 mzLib/Test/Transcriptomics/TestData/20mer1.fasta.gz create mode 100644 mzLib/Test/Transcriptomics/TestData/20mer1.xml create mode 100644 mzLib/Test/Transcriptomics/TestData/20mer1.xml.gz diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index 76ab4ac9b..fa53ca295 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -495,6 +495,18 @@ Always + + Always + + + PreserveNewest + + + Always + + + Always + PreserveNewest diff --git a/mzLib/Test/Transcriptomics/TestData/20mer1.fasta b/mzLib/Test/Transcriptomics/TestData/20mer1.fasta new file mode 100644 index 000000000..c222589c1 --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestData/20mer1.fasta @@ -0,0 +1,2 @@ +>id:2|Name:20mer1|SOterm:20mer1|Type:tRNA|Subtype:Ala|Feature:VGC|Cellular_Localization:freezer|Species:standard +GUACUGCCUCUAGUGAAGCA \ No newline at end of file diff --git a/mzLib/Test/Transcriptomics/TestData/20mer1.fasta.gz b/mzLib/Test/Transcriptomics/TestData/20mer1.fasta.gz new file mode 100644 index 0000000000000000000000000000000000000000..2fe54f9ab05d0cf835bd1fc6f7741d53388df7c7 GIT binary patch literal 135 zcmV;20C@i&iwFRRiTq{&12xCJ3IZ_<0N`ElUG&BbLI{{ePO#Qh;x$5{_d~87Jo513 z;J1CJoS^ewMuqM$svSLCtJ?VA{j(_4TaO;vtNwyT{H#)&DLj~*w3JrFo|k!$k~gV& pp5Se&o0^A3hpa2CRi-4C_B)P9k%=Nj + + + 20mer1 + 20mer1 + + + 20mer1 + + + + + standard + + GUACUGCCUCUAGUGAAGCA + + \ No 
newline at end of file diff --git a/mzLib/Test/Transcriptomics/TestData/20mer1.xml.gz b/mzLib/Test/Transcriptomics/TestData/20mer1.xml.gz new file mode 100644 index 0000000000000000000000000000000000000000..19dac16bf1c4a229db2685f1897563244d556383 GIT binary patch literal 254 zcmVR&4(a5zt6n!yzkF1et5S%c>^De!<_E!GfDupYYcWd9b$X;Ko7<34Ywap zrhfJ=g2Bf+F@$iy=+EOb0i`vBOG>fNEchK~D>GS8SyM#W`{VDRg;=K9|Kpr(_wWu72%C}fokA1rnsfOgiRTmKG-#8qHLcOK2vK39e6ziQ z{h8+hUWZgAkRGs$FFD<3v?wbft5V8JR-&v*AxbGYo2}Pc+1x8 rna; + if (dbPath.Contains("fasta")) + rna = RnaDbLoader.LoadRnaFasta(dbPath, true, DecoyType.None, false, + out var errors); + else + rna = RnaDbLoader.LoadRnaXML(dbPath, true, DecoyType.None, false, + new List(), new List(), out _); + + Assert.That(rna.Count, Is.EqualTo(1)); + Assert.That(rna.First().BaseSequence, Is.EqualTo("GUACUGCCUCUAGUGAAGCA")); + } } } diff --git a/mzLib/Test/Transcriptomics/TestDigestion.cs b/mzLib/Test/Transcriptomics/TestDigestion.cs index a46017c0e..acfcacdef 100644 --- a/mzLib/Test/Transcriptomics/TestDigestion.cs +++ b/mzLib/Test/Transcriptomics/TestDigestion.cs @@ -1168,8 +1168,18 @@ public static void TestDatabaseAnnotatedMods_TerminalMods_WithFirstResidueVariab } } + [Test] + public static void TestDigestionMaxIsoforms() + { + var rna = new RNA("GUACUAGACUACAUGGUACAUCA"); + var rnaDigestionParams = new RnaDigestionParams(); + var variableMods = SodiumAdducts.Concat(PotassiumAdducts) + .Concat(TerminalPotassiumAdducts).Concat(TerminalSodiumAdducts).ToList(); - + var digestionProducts = rna.Digest(rnaDigestionParams, new List(), variableMods) + .ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(rnaDigestionParams.MaxModificationIsoforms)); + } #endregion } diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs index d47912f4d..dadba9e11 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs @@ -37,6 +37,10 @@ 
public static Dictionary WriteXmlDatabase( /// A list of nucleic acid sequences to be written to the database. /// The name of the output XML file. /// A dictionary of new modification residue entries. + /// + /// Several chunks of code are commented out. These are blocks that are intended to be implemented in the future, but + /// are not necessary for the bare bones implementation of Transcriptomics + /// private static Dictionary WriteNucleicAcidXmlDatabase( Dictionary>> additionalModsToAddToProteins, List nucleicAcidList, string outputFileName) diff --git a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDecoyGenerator.cs b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDecoyGenerator.cs index 6bd25e31f..b9cc20e1d 100644 --- a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDecoyGenerator.cs +++ b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDecoyGenerator.cs @@ -19,6 +19,7 @@ namespace UsefulProteomicsDatabases.Transcriptomics /// The GenerateDecoys method serves as the main entry point, delegating to specific decoy generation methods based on the specified . /// TODO: Implement Shuffle and Slide Decoys /// TODO: Consider passing digestion motif as optional parameter to leave digestion sites intact. Currently leaving the 3' intact as it is the predominant cleavage motif. + /// TODO: Consider palindromic sequences and the result they have on fragment ions (d/z are identical, c/y are identical).
This will be particularly important for slide decoys /// public static class RnaDecoyGenerator { From 04f7e67aadf082b8949ee7a10c50e3559988e51e Mon Sep 17 00:00:00 2001 From: nbollis Date: Tue, 8 Oct 2024 12:56:27 -0500 Subject: [PATCH 17/17] Added test coverage to the localize method within BioPolymerWithSetMods --- mzLib/Omics/IBioPolymerWithSetMods.cs | 11 ++- .../PeptideWithSetModifications.cs | 8 +- .../Transcriptomics/TestOligoWithSetMods.cs | 79 +++++++++++++++++++ .../Digestion/OligoWithSetMods.cs | 16 +++- mzLib/Transcriptomics/RNA.cs | 1 + 5 files changed, 106 insertions(+), 9 deletions(-) create mode 100644 mzLib/Test/Transcriptomics/TestOligoWithSetMods.cs diff --git a/mzLib/Omics/IBioPolymerWithSetMods.cs b/mzLib/Omics/IBioPolymerWithSetMods.cs index 1c3ade66a..0b9926a01 100644 --- a/mzLib/Omics/IBioPolymerWithSetMods.cs +++ b/mzLib/Omics/IBioPolymerWithSetMods.cs @@ -50,7 +50,16 @@ public void Fragment(DissociationType dissociationType, FragmentationTerminus fr public void FragmentInternally(DissociationType dissociationType, int minLengthOfFragments, List products); - public IBioPolymerWithSetMods Localize(int j, double massToLocalize); + /// + /// Outputs a duplicate IBioPolymerWithSetMods with a localized mass shift, replacing a modification when present + /// + /// Used to localize an unknown mass shift in the MetaMorpheus Localization Engine + /// + /// + /// The key of the target position in the AllModsOneIsNterminus dictionary minus 2 (implementations store the localized mass at key indexOfMass + 2) + /// The mass to add to the BioPolymer + /// + public IBioPolymerWithSetMods Localize(int indexOfMass, double massToLocalize); public static string GetBaseSequenceFromFullSequence(string fullSequence) { diff --git a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs index 1b7d32d61..8eb6e6bdf 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs +++
b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs @@ -613,17 +613,17 @@ public void FragmentInternally(DissociationType dissociationType, int minLengthO } } - public IBioPolymerWithSetMods Localize(int j, double massToLocalize) + public IBioPolymerWithSetMods Localize(int indexOfMass, double massToLocalize) { var dictWithLocalizedMass = new Dictionary(AllModsOneIsNterminus); double massOfExistingMod = 0; - if (dictWithLocalizedMass.TryGetValue(j + 2, out Modification modToReplace)) + if (dictWithLocalizedMass.TryGetValue(indexOfMass + 2, out Modification modToReplace)) { massOfExistingMod = (double)modToReplace.MonoisotopicMass; - dictWithLocalizedMass.Remove(j + 2); + dictWithLocalizedMass.Remove(indexOfMass + 2); } - dictWithLocalizedMass.Add(j + 2, new Modification(_locationRestriction: "Anywhere.", _monoisotopicMass: massToLocalize + massOfExistingMod)); + dictWithLocalizedMass.Add(indexOfMass + 2, new Modification(_locationRestriction: "Anywhere.", _monoisotopicMass: massToLocalize + massOfExistingMod)); var peptideWithLocalizedMass = new PeptideWithSetModifications(Protein, _digestionParams, OneBasedStartResidueInProtein, OneBasedEndResidueInProtein, CleavageSpecificityForFdrCategory, PeptideDescription, MissedCleavages, dictWithLocalizedMass, NumFixedMods); diff --git a/mzLib/Test/Transcriptomics/TestOligoWithSetMods.cs b/mzLib/Test/Transcriptomics/TestOligoWithSetMods.cs new file mode 100644 index 000000000..6255ef2fd --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestOligoWithSetMods.cs @@ -0,0 +1,79 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using NUnit.Framework; +using Omics.Modifications; +using Transcriptomics.Digestion; +using Transcriptomics; + +namespace Test.Transcriptomics +{ + [ExcludeFromCodeCoverage] + public static class TestOligoWithSetMods + { + [Test] + [TestCase( 0, 1, 20.45)] + [TestCase(1, 1, 
20.45)] + [TestCase( 0, 2, 20.45)] + [TestCase(1, 2, 20.45)] + [TestCase( 0, 5, 28.37)] + [TestCase(1, 5, 28.37)] + [TestCase( 0, 6, 28.37)] + [TestCase(1, 6, 28.37)] + public static void TestLocalize(int modsOnOligo, int indexOfMass, double massToLocalize) + { + var oligoWithSetMods = new RNA("GUACUG", + oneBasedPossibleLocalizedModifications: new Dictionary> { { 4, [TestDigestion.PotassiumAdducts[1]] } }) + .Digest(new RnaDigestionParams(), [], []) + .ElementAt(modsOnOligo); + + Assert.That(oligoWithSetMods.AllModsOneIsNterminus.Count, Is.EqualTo(modsOnOligo)); + + // Act + var localizedOligo = oligoWithSetMods.Localize(indexOfMass - 2, massToLocalize); + + // Assert + int expectedModificationCount; + double expectedMass; + if (modsOnOligo == 1) // if the oligo started with a mod + { + int indexOfOriginalMod = oligoWithSetMods.AllModsOneIsNterminus.Keys.First(); + + // ensure original modification exist + Assert.That(localizedOligo.AllModsOneIsNterminus.ContainsKey(indexOfOriginalMod)); + + if (indexOfOriginalMod != indexOfMass) // Additional mass was added to a different location + { + expectedModificationCount = modsOnOligo + 1; + expectedMass = massToLocalize; + + // ensure original modification is still intact + Assert.That(oligoWithSetMods.OneBasedPossibleLocalizedModifications[indexOfOriginalMod][0].MonoisotopicMass, + Is.EqualTo(localizedOligo.AllModsOneIsNterminus[indexOfOriginalMod].MonoisotopicMass)); + } + else // Additional mass was added to the location of an existing modification + { + expectedModificationCount = modsOnOligo; + expectedMass = massToLocalize + TestDigestion.PotassiumAdducts[1].MonoisotopicMass!.Value; + + // ensure original modification has been altered + Assert.That(oligoWithSetMods.OneBasedPossibleLocalizedModifications[indexOfOriginalMod][0].MonoisotopicMass, + Is.Not.EqualTo(localizedOligo.AllModsOneIsNterminus[indexOfOriginalMod].MonoisotopicMass)); + } + } + else // oligo started with no modifications + { + 
expectedModificationCount = modsOnOligo + 1; + expectedMass = massToLocalize; + } + + + Assert.That(expectedModificationCount, Is.EqualTo(localizedOligo.AllModsOneIsNterminus.Count)); + Assert.That(localizedOligo.AllModsOneIsNterminus.ContainsKey(indexOfMass)); + Assert.That(expectedMass, Is.EqualTo(localizedOligo.AllModsOneIsNterminus[indexOfMass].MonoisotopicMass)); + } + } +} diff --git a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs index 5fab7d6bd..19902f57e 100644 --- a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs +++ b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs @@ -286,17 +286,25 @@ public IEnumerable GetNeutralFragments(ProductType type, Nucleotide[]? } } - public IBioPolymerWithSetMods Localize(int j, double massToLocalize) + /// + /// Outputs a duplicate IBioPolymerWithSetMods with a localized mass shift, replacing a modification when present + /// + /// Used to localize an unknown mass shift in the MetaMorpheus Localization Engine + /// + /// + /// The key of the target position in the AllModsOneIsNterminus dictionary minus 2 (the localized mass is stored at key indexOfMass + 2) + /// The mass to add to the BioPolymer + public IBioPolymerWithSetMods Localize(int indexOfMass, double massToLocalize) { var dictWithLocalizedMass = new Dictionary(AllModsOneIsNterminus); double massOfExistingMod = 0; - if (dictWithLocalizedMass.TryGetValue(j + 2, out Modification modToReplace)) + if (dictWithLocalizedMass.TryGetValue(indexOfMass + 2, out Modification modToReplace)) { massOfExistingMod = (double)modToReplace.MonoisotopicMass; - dictWithLocalizedMass.Remove(j + 2); + dictWithLocalizedMass.Remove(indexOfMass + 2); } - dictWithLocalizedMass.Add(j + 2, new Modification(_locationRestriction: "Anywhere.", _monoisotopicMass: massToLocalize + massOfExistingMod)); + dictWithLocalizedMass.Add(indexOfMass + 2, new Modification(_locationRestriction: "Anywhere.", _monoisotopicMass: massToLocalize + massOfExistingMod)); var
peptideWithLocalizedMass = new OligoWithSetMods(NucleicAcid, _digestionParams, OneBasedStartResidue, OneBasedEndResidue, MissedCleavages, CleavageSpecificityForFdrCategory, dictWithLocalizedMass, NumFixedMods, FivePrimeTerminus, ThreePrimeTerminus); diff --git a/mzLib/Transcriptomics/RNA.cs b/mzLib/Transcriptomics/RNA.cs index 5d5fcb2f6..41e3a64e9 100644 --- a/mzLib/Transcriptomics/RNA.cs +++ b/mzLib/Transcriptomics/RNA.cs @@ -31,6 +31,7 @@ public RNA(string sequence, IHasChemicalFormula? fivePrimeTerm = null, IHasChemi /// /// /// + /// /// public RNA(string sequence, string name, string identifier, string organism, string databaseFilePath, IHasChemicalFormula? fivePrimeTerminus = null, IHasChemicalFormula? threePrimeTerminus = null,