Merged master into quandenser-pipeline branch
MatthewThe committed Sep 2, 2019
2 parents 9a1c401 + bab568f commit db9af7c
Showing 9 changed files with 75 additions and 21 deletions.
2 changes: 1 addition & 1 deletion bin/normalize_intensities.py
@@ -64,7 +64,7 @@ def normalizeIntensitiesWithFactorArrays(clusterQuantExtraFile, rTimeFactorArray
for key in rTimeFactorArrays:
rTimeArrays[key], factorArrays[key] = zip(*rTimeFactorArrays[key])
print("Writing", clusterQuantExtraNormalizedFile)
writer = csv.writer(open(clusterQuantExtraNormalizedFile, 'w'), delimiter = '\t')
writer = parsers.getTsvWriter(clusterQuantExtraNormalizedFile)
precClusters = parsers.parseFeatureClustersFile(clusterQuantExtraFile)
for i, precCluster in enumerate(precClusters):
for row in precCluster:
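The csv.writer call is swapped for a helper from triqler's parsers module (the same swap is made for the reader in bin/percolator.py below), so all tab-separated I/O goes through one place. A minimal sketch of what such helpers could look like — an assumption for illustration, not triqler's actual implementation:

import csv

def getTsvReader(filename):
  # tab-separated reader; newline='' is the csv module's recommended open mode
  return csv.reader(open(filename, 'r', newline=''), delimiter='\t')

def getTsvWriter(filename):
  # tab-separated writer; newline='' avoids the extra blank rows csv.writer
  # would otherwise emit on Windows
  return csv.writer(open(filename, 'w', newline=''), delimiter='\t')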
6 changes: 4 additions & 2 deletions bin/percolator.py
@@ -10,6 +10,8 @@
import csv
import collections

from triqler import parsers

PercolatorPoutPsmsBase = collections.namedtuple("PercolatorPoutPsms", "id filename scannr charge svm_score qvalue PEP peptide proteins")

class PercolatorPoutPsms(PercolatorPoutPsmsBase):
@@ -21,7 +23,7 @@ def toString(self):

# works for peptide and psms pouts
def parsePsmsPout(poutFile, qThresh = 1.0, proteinMap = None, parseId = True, fixScannr = False):
reader = csv.reader(open(poutFile, 'r'), delimiter='\t')
reader = parsers.getTsvReader(poutFile)
headers = next(reader) # save the header

cruxOutput = True if "percolator score" in headers else False
@@ -50,7 +52,7 @@ def parsePsmsPout(poutFile, qThresh = 1.0, proteinMap = None, parseId = True, fi
proteins = map(proteinMap, proteins)

if cruxOutput:
yield PercolatorPoutPsms(row[fileIdxCol] + "_" + row[scanCol] + "_" + row[chargeCol], "", int(row[scanCol]), int(row[chargeCol]), float(row[scoreCol]), float(row[qvalCol]), 1.0, row[terminalsCol][0] + "." + row[peptideCol] + row[terminalsCol][1], proteins)
yield PercolatorPoutPsms(row[fileIdxCol] + "_" + row[scanCol] + "_" + row[chargeCol], "", int(row[scanCol]), int(row[chargeCol]), float(row[scoreCol]), float(row[qvalCol]), 1.0, row[terminalsCol][0] + "." + row[peptideCol] + "." + row[terminalsCol][1], proteins)
elif parseId:
yield PercolatorPoutPsms(row[0], getFileName(row[0], fixScannr), getId(row[0], fixScannr), getCharge(row[0]), float(row[1]), float(row[qvalCol]), float(row[3]), row[4], proteins)
else:
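Besides the switch to parsers.getTsvReader, the functional change in the crux branch is the added "." before the C-terminal flanking residue, so the peptide string follows the usual X.PEPTIDE.Y notation. With hypothetical values, purely for illustration:

terminals, peptide = "RK", "ELVISLIVESK"  # hypothetical flanking residues and peptide sequence

old = terminals[0] + "." + peptide + terminals[1]        # 'R.ELVISLIVESKK' - flank fused onto the sequence
new = terminals[0] + "." + peptide + "." + terminals[1]  # 'R.ELVISLIVESK.K' - standard dotted notation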
15 changes: 12 additions & 3 deletions bin/prepare_input.py
@@ -31,8 +31,9 @@ def main(argv):

simpleOutputFormat = False # skips the linkPEP column from Quandenser output
skipNormalization = False # skips retention time based feature intensity normalization
retainUnidentified = False # keeps features without identification in the Triqler input file
try:
opts, args = getopt.getopt(argv,"hi:l:o:f:q:e:sn",["ifile=","filelist=","featurefile=","simple_output=","no_normalization="])
opts, args = getopt.getopt(argv,"hi:l:o:f:q:e:snu",["ifile=","filelist=","featurefile=","simple_output=","no_normalization=","retain_unidentified"])
except getopt.GetoptError:
print(helpMessage)
sys.exit(2)
@@ -50,17 +51,25 @@ def main(argv):
simpleOutputFormat = True
elif opt in ("-n", "--no_normalization"):
skipNormalization = True
elif opt in ("-u", "retain_unidentified"):
retainUnidentified = True
elif opt in ("-q"):
peptQuantRowFile = arg

if not skipNormalization:
clusterQuantFileNormalized = clusterQuantFile.replace(".tsv", ".normalized.tsv")
if not os.path.isfile(clusterQuantFileNormalized):
print("Applying retention-time dependent intensity normalization")
normalize.normalizeIntensitiesRtimeBased(clusterQuantFile, clusterQuantFileNormalized)
else:
print("Reusing previously generated normalized feature group file:", clusterQuantFileNormalized, ". Remove this file to redo normalization")
clusterQuantFile = clusterQuantFileNormalized
else:
print("Skipping retention-time dependent intensity normalization")

params = dict()
params['simpleOutputFormat'] = simpleOutputFormat
params['retainUnidentified'] = retainUnidentified
fileList, params['groups'], params['groupLabels'] = parsers.parseFileList(fileListFile)
specToPeptideMap = parsePsmsPoutFiles(psmsOutputFiles)
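Note that getopt hands long options back with their leading dashes, which is why the new branch above compares against "--retain_unidentified" (with the dashes) rather than the bare option name. A minimal, self-contained illustration (not the project's code):

import getopt

opts, args = getopt.getopt(["--retain_unidentified", "-u"], "u", ["retain_unidentified"])
print(opts)  # [('--retain_unidentified', ''), ('-u', '')]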

@@ -85,7 +94,7 @@ def printTriqlerInputFile(fileList, clusterQuantFile, quantRowFile, specToPeptid

fileNameConditionPairs = [[x.split("/")[-1], parsers.getGroupLabel(idx, params['groups'], params['groupLabels'])] for idx, x in enumerate(fileList)]

writer = csv.writer(open(quantRowFile, 'w'), delimiter = '\t')
writer = parsers.getTsvWriter(quantRowFile)
if params['simpleOutputFormat']:
writer.writerow(parsers.TriqlerSimpleInputRowHeaders)
else:
@@ -103,7 +112,7 @@ def printTriqlerInputFile(fileList, clusterQuantFile, quantRowFile, specToPeptid
for peptLinkPEP in pc.peptLinkPEPs.split(","):
spectrumIdx, linkPEP = parsePeptideLinkPEP(peptLinkPEP)
peptide, identPEP, proteins, searchScore, charge = specToPeptideMap(spectrumIdx)
if pc.intensity > 0.0 and linkPEP < 1.0:
if pc.intensity > 0.0 and linkPEP < 1.0 and (params["retainUnidentified"] or peptide != "NA"):
# run condition charge spectrumId linkPEP featureClusterId search_score intensity peptide proteins
run, condition = fileNameConditionPairs[fileIdx]
row = parsers.TriqlerInputRow(run, condition, charge, spectrumIdx, linkPEP, featureClusterIdx, searchScore, pc.intensity, peptide, proteins)
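The extended condition keeps a consensus feature only if it has a positive intensity, a link PEP below 1.0, and either an assigned peptide or the new --retain_unidentified flag. A stripped-down version of the predicate, with hypothetical values:

def keepFeature(intensity, linkPEP, peptide, retainUnidentified):
  # unidentified features carry the placeholder peptide "NA"
  return intensity > 0.0 and linkPEP < 1.0 and (retainUnidentified or peptide != "NA")

print(keepFeature(2.5e6, 0.05, "NA", False))  # False: unidentified features are dropped by default
print(keepFeature(2.5e6, 0.05, "NA", True))   # True: kept when -u/--retain_unidentified is set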
4 changes: 3 additions & 1 deletion src/DinosaurFeatureList.h
@@ -92,7 +92,9 @@ class DinosaurFeatureList {
inline void sortByPrecMz() { std::sort(features_.begin(), features_.end(), lessPrecMz); }
inline void sortByFeatureIdx() { std::sort(features_.begin(), features_.end(), lessFeatureIdx); }

inline static bool lessPrecMz(const DinosaurFeature& a, const DinosaurFeature& b) { return (a.precMz < b.precMz); }
inline static bool lessPrecMz(const DinosaurFeature& a, const DinosaurFeature& b) {
return (a.precMz < b.precMz) || (a.precMz == b.precMz && a.rTime < b.rTime);
}
inline static bool lessFeatureIdx(const DinosaurFeature& a, const DinosaurFeature& b) { return (a.featureIdx < b.featureIdx); }
inline std::vector<DinosaurFeature>::const_iterator getPrecMzIterator(float precMz) const {
return std::lower_bound(begin(), end(), precMz, DinosaurFeatureLessPrecMzLower());
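Breaking m/z ties on retention time makes the sort order reproducible whenever the (precMz, rTime) pairs are unique: two features with the same precursor m/z but retention times of, say, 20.4 s and 35.1 s (hypothetical values) now always compare in that order, so repeated runs yield identically sorted feature lists.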
23 changes: 18 additions & 5 deletions src/DinosaurIO.cpp
@@ -24,6 +24,7 @@ const std::string DinosaurIO::kDinosaurJar = "\"" + Globals::getJarPath() + std:

std::string DinosaurIO::javaMemory_ = "24000M";
int DinosaurIO::javaNumThreads_ = 4;
int DinosaurIO::seed_ = 1;

void DinosaurIO::setJavaMemory(std::string mem) {
char lastChar = std::tolower(*mem.rbegin());
@@ -47,7 +48,11 @@ int DinosaurIO::runDinosaurTargeted(const std::string& outputDir, const std::str
}

int DinosaurIO::runDinosaur(const std::string& dinosaurCmd) {
std::string cmd = "java -Xmx" + javaMemory_ + " -jar " + kDinosaurJar + " --force --concurrency=" + boost::lexical_cast<std::string>(javaNumThreads_) + " --profiling=true --nReport=0 " + dinosaurCmd;
std::string cmd = "java -Xmx" + javaMemory_ + " -jar " + kDinosaurJar +
" --force --profiling=true --nReport=0 " +
" --concurrency=" + boost::lexical_cast<std::string>(javaNumThreads_) +
" --seed=" + boost::lexical_cast<std::string>(seed_) + " " +
dinosaurCmd;
if (Globals::VERB > 2) {
std::cerr << cmd << std::endl;
}
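With the defaults set in this file (24000M of Java heap, 4 threads, seed 1), the assembled command would look roughly like: java -Xmx24000M -jar <Dinosaur jar> --force --profiling=true --nReport=0 --concurrency=4 --seed=1 <dinosaurCmd>. The explicit Dinosaur seed is the only new argument.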
@@ -66,18 +71,26 @@ void DinosaurIO::parseDinosaurFeatureFile(std::istream& dataStream, int fileIdx,

getline(dataStream, featureLine); // skip header
size_t lineNr = 2;
std::vector<DinosaurFeature> ftVec;
while (getline(dataStream, featureLine)) {
if (lineNr % 1000000 == 0 && Globals::VERB > 1) {
std::cerr << "Processing line " << lineNr << std::endl;
}
DinosaurFeature ft = parseDinosaurFeatureRow(featureLine);
ft.featureIdx = lineNr - 2;
ft.fileIdx = fileIdx;

dinosaurFeatures.push_back(ft);
ftVec.push_back(ft);

++lineNr;
}

std::sort(ftVec.begin(), ftVec.end(), DinosaurFeatureList::lessPrecMz);

size_t ftIdx = 0;
for (std::vector<DinosaurFeature>::iterator it = ftVec.begin(); it != ftVec.end(); ++it, ++ftIdx) {
it->featureIdx = ftIdx;
it->fileIdx = fileIdx;

dinosaurFeatures.push_back(*it);
}
}

DinosaurFeature DinosaurIO::parseDinosaurFeatureRow(std::string& line) {
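Features are now collected first, sorted by precursor m/z with the retention-time tie-break from DinosaurFeatureList::lessPrecMz, and only then given their featureIdx, so the index follows the m/z order instead of the line order in the Dinosaur output file. A small Python illustration of the same re-indexing scheme (not the project's code; the stand-in Feature type is hypothetical):

from collections import namedtuple

Feature = namedtuple("Feature", "precMz rTime")
parsed = [Feature(500.2, 35.1), Feature(499.8, 12.0), Feature(500.2, 20.4)]  # file order

# sort by (precMz, rTime), then hand out featureIdx in that order
for featureIdx, ft in enumerate(sorted(parsed, key=lambda f: (f.precMz, f.rTime))):
  print(featureIdx, ft.precMz, ft.rTime)
# 0 499.8 12.0
# 1 500.2 20.4
# 2 500.2 35.1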
4 changes: 4 additions & 0 deletions src/DinosaurIO.h
@@ -39,6 +39,9 @@ class DinosaurIO {
static void setJavaNumThreads(int numThreads) {
javaNumThreads_ = numThreads;
}
static void setSeed(int seed) {
seed_ = seed;
}

static void parseDinosaurFeatureFile(std::istream& dataStream, int fileIdx, DinosaurFeatureList& dinosaurFeatures);
static int runDinosaurGlobal(const std::string& outputDir, const std::string& mzMLFile);
@@ -51,6 +54,7 @@

static std::string javaMemory_;
static int javaNumThreads_;
static int seed_;


};
17 changes: 12 additions & 5 deletions src/FeatureAlignment.cpp
@@ -47,13 +47,16 @@ void FeatureAlignment::matchFeatures(
AlignRetention& alignRetention,
std::vector<DinosaurFeatureList>& allFeatures) {
std::vector<std::pair<int, FilePair> >::const_iterator filePairIt;
for (filePairIt = featureAlignmentQueue.begin();
filePairIt != featureAlignmentQueue.end(); ++filePairIt) {
size_t alignmentCnt = 1u;
for (filePairIt = featureAlignmentQueue.begin();
filePairIt != featureAlignmentQueue.end(); ++filePairIt, ++alignmentCnt) {
FilePair filePair = filePairIt->second;

if (Globals::VERB > 1) {
std::cerr << "Matching features " << filePair.fileIdx1
<< "->" << filePair.fileIdx2 << std::endl;
std::cerr << "Matching features " << filePair.fileIdx1
<< "->" << filePair.fileIdx2
<< " (" << alignmentCnt << "/"
<< featureAlignmentQueue.size() << ")" << std::endl;
}
std::string targetMzMLFile = fileList.getFilePath(filePair.fileIdx2);
getFeatureMap(filePair, targetMzMLFile,
@@ -150,7 +153,11 @@ void FeatureAlignment::matchFeatures(
percolatorArgs.push_back(percolatorOutputFile);
percolatorArgs.push_back("--decoy-results-psms");
percolatorArgs.push_back(percolatorOutputFile + ".decoys");

/*
percolatorArgs.push_back("--tab-out");
percolatorArgs.push_back(percolatorOutputFile + ".pin");
*/

PercolatorAdapter percolatorAdapter;
percolatorAdapter.parseOptions(percolatorArgs);
percolatorAdapter.init(kLinkFeatureNames);
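The alignment loop now reports its position in the file-pair queue: with, say, six queued pairs (a hypothetical number) the log line reads "Matching features 0->1 (1/6)" instead of just "Matching features 0->1". The --tab-out/.pin Percolator arguments are now commented out rather than passed.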
23 changes: 20 additions & 3 deletions src/Quandenser.cpp
@@ -18,7 +18,7 @@

namespace quandenser {

Quandenser::Quandenser() : call_(""), fnPrefix_("Quandenser"),
Quandenser::Quandenser() : call_(""), fnPrefix_("Quandenser"), seed_(1u),
maxMissingValues_(-1), intensityScoreThreshold_(0.5),
spectrumBatchFileFN_(""), outputFolder_("Quandenser_output"),
outputSpectrumFile_(""), maraclusterPpmTol_(20.0f),
@@ -85,6 +85,10 @@ bool Quandenser::parseOptions(int argc, char **argv) {
"output-folder",
"Writable folder for output files (default: 'Quandenser_output')",
"path");
cmd.defineOption("s",
"seed",
"Seed for pseudo random number generators of Dinosaur and Percolator (default: 1)",
"int");
cmd.defineOption("a",
"prefix",
"Output files will be prefixed as e.g. <prefix>.clusters_p10.tsv (default: 'Quandenser')",
@@ -182,7 +186,14 @@ bool Quandenser::parseOptions(int argc, char **argv) {
if (cmd.optionSet("output-folder")) {
outputFolder_ = cmd.options["output-folder"];
}


if (cmd.optionSet("seed")) {
int seed = cmd.getInt("seed", 1, 20000); // [1,20000] are Percolator's limits for the seed
percolatorArgs_.push_back("--seed");
percolatorArgs_.push_back(cmd.options["seed"]);
DinosaurIO::setSeed(seed);
}

if (cmd.optionSet("spec-out")) {
outputSpectrumFile_ = cmd.options["spec-out"];
}
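The same seed is propagated to both tools: a hypothetical invocation with --seed 42 would append "--seed 42" to percolatorArgs_ for every Percolator run and set the Dinosaur seed via DinosaurIO::setSeed(42), which ends up as --seed=42 on the Dinosaur command line.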
@@ -362,9 +373,15 @@ int Quandenser::run() {
std::cerr << "Error: could not create output directory at " << outputFolder_ << std::endl;
return EXIT_FAILURE;
}

maracluster::SpectrumFileList fileList;
fileList.initFromFile(spectrumBatchFileFN_);


if (fileList.size() <= 2u) {
std::cerr << "Error: less than 2 spectrum files were specified, Quandenser needs at least two files to perform a meaningful alignment." << std::endl;
return EXIT_FAILURE;
}

boost::filesystem::path dinosaurFolder(rootPath);
dinosaurFolder /= "dinosaur";
boost::filesystem::create_directories(dinosaurFolder, returnedError);
2 changes: 1 addition & 1 deletion src/Quandenser.h
@@ -65,7 +65,7 @@ class Quandenser {
std::string outputFolder_;
std::string outputSpectrumFile_;
std::string fnPrefix_;

int seed_;
int maxMissingValues_;
float intensityScoreThreshold_;

