Merged master into quandenser-pipeline branch
MatthewThe committed Sep 2, 2019
2 parents 9a1c401 + bab568f commit db9af7c
Showing 9 changed files with 75 additions and 21 deletions.
2 changes: 1 addition & 1 deletion bin/normalize_intensities.py
@@ -64,7 +64,7 @@ def normalizeIntensitiesWithFactorArrays(clusterQuantExtraFile, rTimeFactorArray
for key in rTimeFactorArrays:
rTimeArrays[key], factorArrays[key] = zip(*rTimeFactorArrays[key])
print("Writing", clusterQuantExtraNormalizedFile)
writer = csv.writer(open(clusterQuantExtraNormalizedFile, 'w'), delimiter = '\t')
writer = parsers.getTsvWriter(clusterQuantExtraNormalizedFile)
precClusters = parsers.parseFeatureClustersFile(clusterQuantExtraFile)
for i, precCluster in enumerate(precClusters):
for row in precCluster:
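The csv.writer call is swapped for a helper from triqler's parsers module (the same swap is made for the reader in bin/percolator.py below), so all tab-separated I/O goes through one place. A minimal sketch of what such helpers could look like — an assumption for illustration, not triqler's actual implementation:

import csv

def getTsvReader(filename):
  # tab-separated reader; newline='' is the csv module's recommended open mode
  return csv.reader(open(filename, 'r', newline=''), delimiter='\t')

def getTsvWriter(filename):
  # tab-separated writer; newline='' avoids the extra blank rows csv.writer
  # would otherwise emit on Windows
  return csv.writer(open(filename, 'w', newline=''), delimiter='\t')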
6 changes: 4 additions & 2 deletions bin/percolator.py
@@ -10,6 +10,8 @@
import csv
import collections

from triqler import parsers

PercolatorPoutPsmsBase = collections.namedtuple("PercolatorPoutPsms", "id filename scannr charge svm_score qvalue PEP peptide proteins")

class PercolatorPoutPsms(PercolatorPoutPsmsBase):
@@ -21,7 +23,7 @@ def toString(self):

# works for peptide and psms pouts
def parsePsmsPout(poutFile, qThresh = 1.0, proteinMap = None, parseId = True, fixScannr = False):
reader = csv.reader(open(poutFile, 'r'), delimiter='\t')
reader = parsers.getTsvReader(poutFile)
headers = next(reader) # save the header

cruxOutput = True if "percolator score" in headers else False
@@ -50,7 +52,7 @@ def parsePsmsPout(poutFile, qThresh = 1.0, proteinMap = None, parseId = True, fi
proteins = map(proteinMap, proteins)

if cruxOutput:
yield PercolatorPoutPsms(row[fileIdxCol] + "_" + row[scanCol] + "_" + row[chargeCol], "", int(row[scanCol]), int(row[chargeCol]), float(row[scoreCol]), float(row[qvalCol]), 1.0, row[terminalsCol][0] + "." + row[peptideCol] + row[terminalsCol][1], proteins)
yield PercolatorPoutPsms(row[fileIdxCol] + "_" + row[scanCol] + "_" + row[chargeCol], "", int(row[scanCol]), int(row[chargeCol]), float(row[scoreCol]), float(row[qvalCol]), 1.0, row[terminalsCol][0] + "." + row[peptideCol] + "." + row[terminalsCol][1], proteins)
elif parseId:
yield PercolatorPoutPsms(row[0], getFileName(row[0], fixScannr), getId(row[0], fixScannr), getCharge(row[0]), float(row[1]), float(row[qvalCol]), float(row[3]), row[4], proteins)
else:
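Besides the switch to parsers.getTsvReader, the functional change in the crux branch is the added "." before the C-terminal flanking residue, so the peptide string follows the usual X.PEPTIDE.Y notation. With hypothetical values, purely for illustration:

terminals, peptide = "RK", "ELVISLIVESK"  # hypothetical flanking residues and peptide sequence

old = terminals[0] + "." + peptide + terminals[1]        # 'R.ELVISLIVESKK' - flank fused onto the sequence
new = terminals[0] + "." + peptide + "." + terminals[1]  # 'R.ELVISLIVESK.K' - standard dotted notation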
15 changes: 12 additions & 3 deletions bin/prepare_input.py
@@ -31,8 +31,9 @@ def main(argv):

simpleOutputFormat = False # skips the linkPEP column from Quandenser output
skipNormalization = False # skips retention time based feature intensity normalization
retainUnidentified = False # keeps features without identification in the Triqler input file
try:
opts, args = getopt.getopt(argv,"hi:l:o:f:q:e:sn",["ifile=","filelist=","featurefile=","simple_output=","no_normalization="])
opts, args = getopt.getopt(argv,"hi:l:o:f:q:e:snu",["ifile=","filelist=","featurefile=","simple_output=","no_normalization=","retain_unidentified"])
except getopt.GetoptError:
print(helpMessage)
sys.exit(2)
@@ -50,17 +51,25 @@ def main(argv):
simpleOutputFormat = True
elif opt in ("-n", "--no_normalization"):
skipNormalization = True
elif opt in ("-u", "retain_unidentified"):
retainUnidentified = True
elif opt in ("-q"):
peptQuantRowFile = arg

if not skipNormalization:
clusterQuantFileNormalized = clusterQuantFile.replace(".tsv", ".normalized.tsv")
if not os.path.isfile(clusterQuantFileNormalized):
print("Applying retention-time dependent intensity normalization")
normalize.normalizeIntensitiesRtimeBased(clusterQuantFile, clusterQuantFileNormalized)
else:
print("Reusing previously generated normalized feature group file:", clusterQuantFileNormalized, ". Remove this file to redo normalization")
clusterQuantFile = clusterQuantFileNormalized
else:
print("Skipping retention-time dependent intensity normalization")

params = dict()
params['simpleOutputFormat'] = simpleOutputFormat
params['retainUnidentified'] = retainUnidentified
fileList, params['groups'], params['groupLabels'] = parsers.parseFileList(fileListFile)
specToPeptideMap = parsePsmsPoutFiles(psmsOutputFiles)
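Note that getopt hands long options back with their leading dashes, which is why the new branch above compares against "--retain_unidentified" (with the dashes) rather than the bare option name. A minimal, self-contained illustration (not the project's code):

import getopt

opts, args = getopt.getopt(["--retain_unidentified", "-u"], "u", ["retain_unidentified"])
print(opts)  # [('--retain_unidentified', ''), ('-u', '')]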

@@ -85,7 +94,7 @@ def printTriqlerInputFile(fileList, clusterQuantFile, quantRowFile, specToPeptid

fileNameConditionPairs = [[x.split("/")[-1], parsers.getGroupLabel(idx, params['groups'], params['groupLabels'])] for idx, x in enumerate(fileList)]

writer = csv.writer(open(quantRowFile, 'w'), delimiter = '\t')
writer = parsers.getTsvWriter(quantRowFile)
if params['simpleOutputFormat']:
writer.writerow(parsers.TriqlerSimpleInputRowHeaders)
else:
@@ -103,7 +112,7 @@ def printTriqlerInputFile(fileList, clusterQuantFile, quantRowFile, specToPeptid
for peptLinkPEP in pc.peptLinkPEPs.split(","):
spectrumIdx, linkPEP = parsePeptideLinkPEP(peptLinkPEP)
peptide, identPEP, proteins, searchScore, charge = specToPeptideMap(spectrumIdx)
if pc.intensity > 0.0 and linkPEP < 1.0:
if pc.intensity > 0.0 and linkPEP < 1.0 and (params["retainUnidentified"] or peptide != "NA"):
# run condition charge spectrumId linkPEP featureClusterId search_score intensity peptide proteins
run, condition = fileNameConditionPairs[fileIdx]
row = parsers.TriqlerInputRow(run, condition, charge, spectrumIdx, linkPEP, featureClusterIdx, searchScore, pc.intensity, peptide, proteins)
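The extended condition keeps a consensus feature only if it has a positive intensity, a link PEP below 1.0, and either an assigned peptide or the new --retain_unidentified flag. A stripped-down version of the predicate, with hypothetical values:

def keepFeature(intensity, linkPEP, peptide, retainUnidentified):
  # unidentified features carry the placeholder peptide "NA"
  return intensity > 0.0 and linkPEP < 1.0 and (retainUnidentified or peptide != "NA")

print(keepFeature(2.5e6, 0.05, "NA", False))  # False: unidentified features are dropped by default
print(keepFeature(2.5e6, 0.05, "NA", True))   # True: kept when -u/--retain_unidentified is set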
4 changes: 3 additions & 1 deletion src/DinosaurFeatureList.h
@@ -92,7 +92,9 @@ class DinosaurFeatureList {
inline void sortByPrecMz() { std::sort(features_.begin(), features_.end(), lessPrecMz); }
inline void sortByFeatureIdx() { std::sort(features_.begin(), features_.end(), lessFeatureIdx); }

inline static bool lessPrecMz(const DinosaurFeature& a, const DinosaurFeature& b) { return (a.precMz < b.precMz); }
inline static bool lessPrecMz(const DinosaurFeature& a, const DinosaurFeature& b) {
return (a.precMz < b.precMz) || (a.precMz == b.precMz && a.rTime < b.rTime);
}
inline static bool lessFeatureIdx(const DinosaurFeature& a, const DinosaurFeature& b) { return (a.featureIdx < b.featureIdx); }
inline std::vector<DinosaurFeature>::const_iterator getPrecMzIterator(float precMz) const {
return std::lower_bound(begin(), end(), precMz, DinosaurFeatureLessPrecMzLower());
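Breaking m/z ties on retention time makes the sort order reproducible whenever the (precMz, rTime) pairs are unique: two features with the same precursor m/z but retention times of, say, 20.4 s and 35.1 s (hypothetical values) now always compare in that order, so repeated runs yield identically sorted feature lists.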
23 changes: 18 additions & 5 deletions src/DinosaurIO.cpp
@@ -24,6 +24,7 @@ const std::string DinosaurIO::kDinosaurJar = "\"" + Globals::getJarPath() + std:

std::string DinosaurIO::javaMemory_ = "24000M";
int DinosaurIO::javaNumThreads_ = 4;
int DinosaurIO::seed_ = 1;

void DinosaurIO::setJavaMemory(std::string mem) {
char lastChar = std::tolower(*mem.rbegin());
@@ -47,7 +48,11 @@ int DinosaurIO::runDinosaurTargeted(const std::string& outputDir, const std::str
}

int DinosaurIO::runDinosaur(const std::string& dinosaurCmd) {
std::string cmd = "java -Xmx" + javaMemory_ + " -jar " + kDinosaurJar + " --force --concurrency=" + boost::lexical_cast<std::string>(javaNumThreads_) + " --profiling=true --nReport=0 " + dinosaurCmd;
std::string cmd = "java -Xmx" + javaMemory_ + " -jar " + kDinosaurJar +
" --force --profiling=true --nReport=0 " +
" --concurrency=" + boost::lexical_cast<std::string>(javaNumThreads_) +
" --seed=" + boost::lexical_cast<std::string>(seed_) + " " +
dinosaurCmd;
if (Globals::VERB > 2) {
std::cerr << cmd << std::endl;
}
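With the defaults set in this file (24000M of Java heap, 4 threads, seed 1), the assembled command would look roughly like: java -Xmx24000M -jar <Dinosaur jar> --force --profiling=true --nReport=0 --concurrency=4 --seed=1 <dinosaurCmd>. The explicit Dinosaur seed is the only new argument.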
@@ -66,18 +71,26 @@ void DinosaurIO::parseDinosaurFeatureFile(std::istream& dataStream, int fileIdx,

getline(dataStream, featureLine); // skip header
size_t lineNr = 2;
std::vector<DinosaurFeature> ftVec;
while (getline(dataStream, featureLine)) {
if (lineNr % 1000000 == 0 && Globals::VERB > 1) {
std::cerr << "Processing line " << lineNr << std::endl;
}
DinosaurFeature ft = parseDinosaurFeatureRow(featureLine);
ft.featureIdx = lineNr - 2;
ft.fileIdx = fileIdx;

dinosaurFeatures.push_back(ft);
ftVec.push_back(ft);

++lineNr;
}

std::sort(ftVec.begin(), ftVec.end(), DinosaurFeatureList::lessPrecMz);

size_t ftIdx = 0;
for (std::vector<DinosaurFeature>::iterator it = ftVec.begin(); it != ftVec.end(); ++it, ++ftIdx) {
it->featureIdx = ftIdx;
it->fileIdx = fileIdx;

dinosaurFeatures.push_back(*it);
}
}

DinosaurFeature DinosaurIO::parseDinosaurFeatureRow(std::string& line) {
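Features are now collected first, sorted by precursor m/z with the retention-time tie-break from DinosaurFeatureList::lessPrecMz, and only then given their featureIdx, so the index follows the m/z order instead of the line order in the Dinosaur output file. A small Python illustration of the same re-indexing scheme (not the project's code; the stand-in Feature type is hypothetical):

from collections import namedtuple

Feature = namedtuple("Feature", "precMz rTime")
parsed = [Feature(500.2, 35.1), Feature(499.8, 12.0), Feature(500.2, 20.4)]  # file order

# sort by (precMz, rTime), then hand out featureIdx in that order
for featureIdx, ft in enumerate(sorted(parsed, key=lambda f: (f.precMz, f.rTime))):
  print(featureIdx, ft.precMz, ft.rTime)
# 0 499.8 12.0
# 1 500.2 20.4
# 2 500.2 35.1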
4 changes: 4 additions & 0 deletions src/DinosaurIO.h
@@ -39,6 +39,9 @@ class DinosaurIO {
static void setJavaNumThreads(int numThreads) {
javaNumThreads_ = numThreads;
}
static void setSeed(int seed) {
seed_ = seed;
}

static void parseDinosaurFeatureFile(std::istream& dataStream, int fileIdx, DinosaurFeatureList& dinosaurFeatures);
static int runDinosaurGlobal(const std::string& outputDir, const std::string& mzMLFile);
@@ -51,6 +54,7 @@

static std::string javaMemory_;
static int javaNumThreads_;
static int seed_;


};
17 changes: 12 additions & 5 deletions src/FeatureAlignment.cpp
@@ -47,13 +47,16 @@ void FeatureAlignment::matchFeatures(
AlignRetention& alignRetention,
std::vector<DinosaurFeatureList>& allFeatures) {
std::vector<std::pair<int, FilePair> >::const_iterator filePairIt;
for (filePairIt = featureAlignmentQueue.begin();
filePairIt != featureAlignmentQueue.end(); ++filePairIt) {
size_t alignmentCnt = 1u;
for (filePairIt = featureAlignmentQueue.begin();
filePairIt != featureAlignmentQueue.end(); ++filePairIt, ++alignmentCnt) {
FilePair filePair = filePairIt->second;

if (Globals::VERB > 1) {
std::cerr << "Matching features " << filePair.fileIdx1
<< "->" << filePair.fileIdx2 << std::endl;
std::cerr << "Matching features " << filePair.fileIdx1
<< "->" << filePair.fileIdx2
<< " (" << alignmentCnt << "/"
<< featureAlignmentQueue.size() << ")" << std::endl;
}
std::string targetMzMLFile = fileList.getFilePath(filePair.fileIdx2);
getFeatureMap(filePair, targetMzMLFile,
@@ -150,7 +153,11 @@ void FeatureAlignment::matchFeatures(
percolatorArgs.push_back(percolatorOutputFile);
percolatorArgs.push_back("--decoy-results-psms");
percolatorArgs.push_back(percolatorOutputFile + ".decoys");

/*
percolatorArgs.push_back("--tab-out");
percolatorArgs.push_back(percolatorOutputFile + ".pin");
*/

PercolatorAdapter percolatorAdapter;
percolatorAdapter.parseOptions(percolatorArgs);
percolatorAdapter.init(kLinkFeatureNames);
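The alignment loop now reports its position in the file-pair queue: with, say, six queued pairs (a hypothetical number) the log line reads "Matching features 0->1 (1/6)" instead of just "Matching features 0->1". The --tab-out/.pin Percolator arguments are now commented out rather than passed.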
23 changes: 20 additions & 3 deletions src/Quandenser.cpp
@@ -18,7 +18,7 @@

namespace quandenser {

Quandenser::Quandenser() : call_(""), fnPrefix_("Quandenser"),
Quandenser::Quandenser() : call_(""), fnPrefix_("Quandenser"), seed_(1u),
maxMissingValues_(-1), intensityScoreThreshold_(0.5),
spectrumBatchFileFN_(""), outputFolder_("Quandenser_output"),
outputSpectrumFile_(""), maraclusterPpmTol_(20.0f),
@@ -85,6 +85,10 @@ bool Quandenser::parseOptions(int argc, char **argv) {
"output-folder",
"Writable folder for output files (default: 'Quandenser_output')",
"path");
cmd.defineOption("s",
"seed",
"Seed for pseudo random number generators of Dinosaur and Percolator (default: 1)",
"int");
cmd.defineOption("a",
"prefix",
"Output files will be prefixed as e.g. <prefix>.clusters_p10.tsv (default: 'Quandenser')",
@@ -182,7 +186,14 @@ bool Quandenser::parseOptions(int argc, char **argv) {
if (cmd.optionSet("output-folder")) {
outputFolder_ = cmd.options["output-folder"];
}


if (cmd.optionSet("seed")) {
int seed = cmd.getInt("seed", 1, 20000); // [1,20000] are Percolator's limits for the seed
percolatorArgs_.push_back("--seed");
percolatorArgs_.push_back(cmd.options["seed"]);
DinosaurIO::setSeed(seed);
}

if (cmd.optionSet("spec-out")) {
outputSpectrumFile_ = cmd.options["spec-out"];
}
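The same seed is propagated to both tools: a hypothetical invocation with --seed 42 would append "--seed 42" to percolatorArgs_ for every Percolator run and set the Dinosaur seed via DinosaurIO::setSeed(42), which ends up as --seed=42 on the Dinosaur command line.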
@@ -362,9 +373,15 @@ int Quandenser::run() {
std::cerr << "Error: could not create output directory at " << outputFolder_ << std::endl;
return EXIT_FAILURE;
}

maracluster::SpectrumFileList fileList;
fileList.initFromFile(spectrumBatchFileFN_);


if (fileList.size() <= 2u) {
std::cerr << "Error: less than 2 spectrum files were specified, Quandenser needs at least two files to perform a meaningful alignment." << std::endl;
return EXIT_FAILURE;
}

boost::filesystem::path dinosaurFolder(rootPath);
dinosaurFolder /= "dinosaur";
boost::filesystem::create_directories(dinosaurFolder, returnedError);
2 changes: 1 addition & 1 deletion src/Quandenser.h
@@ -65,7 +65,7 @@ class Quandenser {
std::string outputFolder_;
std::string outputSpectrumFile_;
std::string fnPrefix_;

int seed_;
int maxMissingValues_;
float intensityScoreThreshold_;

