Skip to content

Commit

Permalink
Added intermediate writing of features between alignments
Browse files Browse the repository at this point in the history
  • Loading branch information
MatthewThe committed Sep 4, 2019
1 parent db9af7c commit df79378
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 54 deletions.
18 changes: 17 additions & 1 deletion src/DinosaurFeatureList.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

#include <boost/unordered/unordered_map.hpp>

#include "percolator/src/DataSet.h"
#include "maracluster/src/BinaryInterface.h"

#include "Globals.h"
#include "DinosaurFeature.h"
Expand Down Expand Up @@ -89,6 +89,22 @@ class DinosaurFeatureList {
}
}

size_t loadFromFile(const std::string& ftFile) {
std::vector<DinosaurFeature> addedFts;
maracluster::BinaryInterface::read(ftFile, addedFts);
for (DinosaurFeature& df : addedFts) {
if (getFeatureIdx(df) == -1) { // this mimicks the MBR feature adding
df.featureIdx = size(); // re-index before adding to the featurelist
push_back(df);
}
}
return addedFts.size();
}

void saveToFile(const std::string& ftFile, bool append) {
maracluster::BinaryInterface::write(features_, ftFile, append);
}

inline void sortByPrecMz() { std::sort(features_.begin(), features_.end(), lessPrecMz); }
inline void sortByFeatureIdx() { std::sort(features_.begin(), features_.end(), lessFeatureIdx); }

Expand Down
58 changes: 42 additions & 16 deletions src/FeatureAlignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ void FeatureAlignment::matchFeatures(
const std::vector<std::pair<int, FilePair> >& featureAlignmentQueue,
const maracluster::SpectrumFileList& fileList,
AlignRetention& alignRetention,
std::vector<DinosaurFeatureList>& allFeatures) {
std::vector<DinosaurFeatureList>& allFeatures,
const std::string& addedFeaturesFile) {
std::vector<std::pair<int, FilePair> >::const_iterator filePairIt;
size_t alignmentCnt = 1u;
for (filePairIt = featureAlignmentQueue.begin();
Expand All @@ -62,15 +63,17 @@ void FeatureAlignment::matchFeatures(
getFeatureMap(filePair, targetMzMLFile,
alignRetention.getAlignment(filePair),
allFeatures.at(filePair.fileIdx1),
allFeatures.at(filePair.fileIdx2));
allFeatures.at(filePair.fileIdx2),
addedFeaturesFile);
}
}

void FeatureAlignment::getFeatureMap(FilePair& filePair,
const std::string& targetMzMLFile,
SplineRegression& alignment,
DinosaurFeatureList& featuresQueryRun,
DinosaurFeatureList& featuresTargetRun) {
DinosaurFeatureList& featuresTargetRun,
const std::string& addedFeaturesFile) {
std::vector<double> rTimesQueryRun, predictedRTimesTargetRun;
DinosaurFeatureList::const_iterator ftIt;
for (ftIt = featuresQueryRun.begin(); ftIt != featuresQueryRun.end(); ++ftIt) {
Expand All @@ -88,7 +91,7 @@ void FeatureAlignment::getFeatureMap(FilePair& filePair,
predictedRTimesTargetRun, rTimeStdev);
}
addFeatureLinks(percolatorOutputFile, candidateFeaturesTargetRun,
featuresTargetRun, featureMatches_[filePair]);
featuresTargetRun, featureMatches_[filePair], addedFeaturesFile);

if (linkPEPMbrSearchThreshold_ < 1.0) {
std::string percolatorMbrOutputFile = percolatorOutputFileBaseFN_ +
Expand All @@ -101,39 +104,52 @@ void FeatureAlignment::getFeatureMap(FilePair& filePair,
rTimeStdev, candidateFeaturesTargetRun);

addFeatureLinks(percolatorMbrOutputFile, candidateFeaturesTargetRun,
featuresTargetRun, featureMatches_[filePair]);
featuresTargetRun, featureMatches_[filePair], addedFeaturesFile);
}

insertPlaceholderFeatures(featuresQueryRun, predictedRTimesTargetRun,
featuresTargetRun, featureMatches_[filePair]);

featuresTargetRun.sortByPrecMz();
featuresTargetRun, featureMatches_[filePair], addedFeaturesFile);
}

void FeatureAlignment::insertPlaceholderFeatures(
const DinosaurFeatureList& featuresQueryRun,
const std::vector<double>& predictedRTimesTargetRun,
DinosaurFeatureList& featuresTargetRun,
std::map<int, FeatureIdxMatch>& precursorLinks) {
std::map<int, FeatureIdxMatch>& precursorLinks,
const std::string& addedFeaturesFile) {
DinosaurFeatureList::const_iterator queryIt;
std::vector<double>::const_iterator predRTimeIt;
int targetFileIdx = featuresTargetRun.begin()->fileIdx;
size_t placeholdersAdded = 0u;
DinosaurFeatureList placeholderFeatures;
for (queryIt = featuresQueryRun.begin(), predRTimeIt = predictedRTimesTargetRun.begin();
queryIt != featuresQueryRun.end(); ++queryIt, ++predRTimeIt) {
if (precursorLinks[queryIt->featureIdx].posteriorErrorProb > linkPEPThreshold_) {
DinosaurFeature placeholderFt = *queryIt;
placeholderFt.fileIdx = targetFileIdx;
placeholderFt.featureIdx = featuresTargetRun.size();
placeholderFt.intensity = 0.0;
placeholderFt.rTime = *predRTimeIt;
featuresTargetRun.push_back(placeholderFt);
placeholderFt.featureIdx = featuresTargetRun.getFeatureIdx(placeholderFt);
placeholderFt.intensity = 0.0;
if (placeholderFt.featureIdx < 0) {
placeholderFt.featureIdx = featuresTargetRun.size();
featuresTargetRun.push_back(placeholderFt);
if (!addedFeaturesFile.empty()) {
placeholderFeatures.push_back(placeholderFt);
}
}
precursorLinks[queryIt->featureIdx] =
FeatureIdxMatch(placeholderFt.featureIdx, 1.0, placeholderFt.intensity);
++placeholdersAdded;
}
}


if (addedFeaturesFile.empty()) {
featuresTargetRun.sortByPrecMz();
} else {
bool append = true;
placeholderFeatures.saveToFile(addedFeaturesFile, append);
}

if (Globals::VERB > 1) {
std::cerr << "Added " << placeholdersAdded << " placeholders." << std::endl;
}
Expand Down Expand Up @@ -197,12 +213,14 @@ void FeatureAlignment::matchFeatures(
void FeatureAlignment::addFeatureLinks(const std::string& percolatorOutputFile,
DinosaurFeatureList& candidateFeaturesTargetRun,
DinosaurFeatureList& featuresTargetRun,
std::map<int, FeatureIdxMatch>& precursorLinks) {
std::map<int, FeatureIdxMatch>& precursorLinks,
const std::string& addedFeaturesFile) {
std::ifstream dataStream(percolatorOutputFile.c_str(), ios::in);

std::cerr << "Links before "<< precursorLinks.size() << std::endl;

int targetFileIdx = featuresTargetRun.begin()->fileIdx;
DinosaurFeatureList addedTargetFeatures;
std::string psmLine;
getline(dataStream, psmLine); /* skip header */
while (getline(dataStream, psmLine)) {
Expand All @@ -226,16 +244,24 @@ void FeatureAlignment::addFeatureLinks(const std::string& percolatorOutputFile,
getCandidatePos(targetFt.featureIdx));
candFt.featureIdx = featuresTargetRun.getFeatureIdx(candFt);
if (candFt.featureIdx < 0) {
candFt.featureIdx = featuresTargetRun.size();
candFt.featureIdx = featuresTargetRun.size() + addedTargetFeatures.size();
candFt.fileIdx = targetFileIdx;
featuresTargetRun.push_back(candFt);
if (!addedFeaturesFile.empty()) {
addedTargetFeatures.push_back(candFt);
}
}
targetFt = candFt;
}
precursorLinks[queryFt.featureIdx] = FeatureIdxMatch(targetFt.featureIdx,
posteriorErrorProb, targetFt.intensity);
}
}

if (!addedFeaturesFile.empty()) {
bool append = false;
addedTargetFeatures.saveToFile(addedFeaturesFile, append);
}

std::cerr << "Links after "<< precursorLinks.size() << std::endl;
}
Expand Down
12 changes: 8 additions & 4 deletions src/FeatureAlignment.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ class FeatureAlignment {
const std::vector<std::pair<int, FilePair> >& featureAlignmentQueue,
const maracluster::SpectrumFileList& fileList,
AlignRetention& alignRetention,
std::vector<DinosaurFeatureList>& allFeatures);
std::vector<DinosaurFeatureList>& allFeatures,
const std::string& addedFeaturesFile);

std::map<FilePair, std::map<int, FeatureIdxMatch> >& getFeatureMatches() {
return featureMatches_;
Expand All @@ -97,13 +98,15 @@ class FeatureAlignment {
const std::string& targetMzMLFile,
SplineRegression& alignment,
DinosaurFeatureList& featuresQueryRun,
DinosaurFeatureList& featuresTargetRun);
DinosaurFeatureList& featuresTargetRun,
const std::string& addedFeaturesFile);

void insertPlaceholderFeatures(
const DinosaurFeatureList& featuresQueryRun,
const std::vector<double>& predictedRTimesTargetRun,
DinosaurFeatureList& featuresTargetRun,
std::map<int, FeatureIdxMatch>& precursorLinks);
std::map<int, FeatureIdxMatch>& precursorLinks,
const std::string& addedFeaturesFile);

void matchFeatures(
const std::string& percolatorOutputFile,
Expand All @@ -115,7 +118,8 @@ class FeatureAlignment {
void addFeatureLinks(const std::string& percolatorOutputFile,
DinosaurFeatureList& candidateFeaturesTargetRun,
DinosaurFeatureList& featuresTargetRun,
std::map<int, FeatureIdxMatch>& precursorLinks);
std::map<int, FeatureIdxMatch>& precursorLinks,
const std::string& addedFeaturesFile);

void findMatches(int label, double rTimeTol,
const DinosaurFeatureList& featuresTargetRun,
Expand Down
88 changes: 60 additions & 28 deletions src/Quandenser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,20 @@ void Quandenser::runMaRaCluster(const std::string& maRaClusterSubFolder,
clusterFilePath = maraclusterAdapter.getClusterFileName();
}

std::string Quandenser::getFeatureFN(
const boost::filesystem::path& featureOutFile, size_t fileIdx) {
return featureOutFile.string() + "." +
boost::lexical_cast<std::string>(fileIdx) + ".dat";
}

std::string Quandenser::getFeatureFN(
const boost::filesystem::path& featureOutFile,
size_t fileIdx1, size_t fileIdx2) {
return featureOutFile.string() + "." +
boost::lexical_cast<std::string>(fileIdx1) + "_" +
boost::lexical_cast<std::string>(fileIdx2) + ".dat";
}

int Quandenser::run() {
time_t startTime;
clock_t startClock;
Expand All @@ -377,7 +391,7 @@ int Quandenser::run() {
maracluster::SpectrumFileList fileList;
fileList.initFromFile(spectrumBatchFileFN_);

if (fileList.size() <= 2u) {
if (fileList.size() <= 2u && !parallel_1_) {
std::cerr << "Error: less than 2 spectrum files were specified, Quandenser needs at least two files to perform a meaningful alignment." << std::endl;
return EXIT_FAILURE;
}
Expand Down Expand Up @@ -425,7 +439,7 @@ int Quandenser::run() {
}

boost::filesystem::path featureOutFile(outputFolder_);
featureOutFile /= "dinosaur/allFeatures";
featureOutFile /= "dinosaur/features";

std::vector<DinosaurFeatureList> allFeatures;
if (!parallel_3_ && !parallel_4_) {
Expand All @@ -447,9 +461,9 @@ int Quandenser::run() {
std::cout << "Parallel 2: Saving dinosaur features" << std::endl;
for (size_t allFeaturesIdx = 0u; allFeaturesIdx < allFeatures.size(); allFeaturesIdx++) {
DinosaurFeatureList ftList = allFeatures.at(allFeaturesIdx);
std::string featureListFile = getFeatureFN(featureOutFile, allFeaturesIdx);
bool append = false;
std::string featureListFile(featureOutFile.string() + "." + boost::lexical_cast<std::string>(allFeaturesIdx) + ".dat");
maracluster::BinaryInterface::write(ftList.getFeatureList(), featureListFile, append);
ftList.saveToFile(featureListFile, append);
}
std::cout << "Parallel 2: Save completed" << std::endl;
}
Expand All @@ -459,9 +473,16 @@ int Quandenser::run() {
for (std::vector<boost::filesystem::path>::const_iterator it = dinosaurFeatureFiles.begin(); it != dinosaurFeatureFiles.end(); ++it) {
DinosaurFeatureList ftList;
allFeatures.push_back(ftList);
std::string featureListFile(featureOutFile.string() + "." + boost::lexical_cast<std::string>(fileIdx) + ".dat");
maracluster::BinaryInterface::read(featureListFile, allFeatures.at(fileIdx).getFeatureList());
std::cerr << "Read in " << allFeatures.at(fileIdx).size() << " features from " << featureListFile << std::endl;
boost::filesystem::path ftFilePath(fileList.getFilePath(fileIdx));
std::string fn(ftFilePath.filename().string());
// Check if the pair assigned exists in the pair folder
if (boost::filesystem::exists("pair/file1/" + fn) ||
boost::filesystem::exists("pair/file2/" + fn) ||
parallel_4_) {
std::string featureListFile = getFeatureFN(featureOutFile, fileIdx);
size_t ftsAdded = allFeatures.at(fileIdx).loadFromFile(featureListFile);
std::cerr << "Read in " << allFeatures.at(fileIdx).size() << " features from " << featureListFile << std::endl;
}
++fileIdx;
}
std::cout << "Parallel 3/4: Loading completed" << std::endl;
Expand All @@ -476,13 +497,14 @@ int Quandenser::run() {
runMaRaCluster(maraclusterSubFolder, fileList, allFeatures, clusterFilePath, spectrumToPrecursorMap);

AlignRetention alignRetention;
std::ifstream fileStream(clusterFilePath.c_str(), ios::in);
MaRaClusterIO::parseClustersForRTimePairs(fileStream, fileList, spectrumToPrecursorMap, alignRetention.getRTimePairsRef());

boost::filesystem::path featureAlignFile(outputFolder_);
featureAlignFile /= "maracluster/featureAlignmentQueue.txt";
std::vector<std::pair<int, FilePair> > featureAlignmentQueue;
if (!parallel_3_ || parallel_3_ == 99999 || !parallel_4_) {
std::string addedFeaturesFile = "";
if (!parallel_3_ && !parallel_4_) {
std::ifstream fileStream(clusterFilePath.c_str(), ios::in);
MaRaClusterIO::parseClustersForRTimePairs(fileStream, fileList, spectrumToPrecursorMap, alignRetention.getRTimePairsRef());

alignRetention.getAlignModelsTree();
alignRetention.createMinDepthTree(featureAlignmentQueue);

Expand Down Expand Up @@ -516,28 +538,40 @@ int Quandenser::run() {
// Load state of featureAlignmentQueue
std::ifstream infile(featureAlignFile.string().c_str(), ios::in);
int round;
string file1;
string file2;
int fileidx1;
int fileidx2;
std::string tmp1, tmp2;
int fileidx1, fileidx2;
int loop_counter = 0; // Used for parallelization
while (infile >> round >> file1 >> file2 >> fileidx1 >> fileidx2) {
std::vector<bool> featuresAdded(fileList.size());
while (infile >> round >> tmp1 >> tmp2 >> fileidx1 >> fileidx2) {
FilePair filePair(fileidx1, fileidx2);

boost::filesystem::path file1(fileList.getFilePath(fileidx1));
boost::filesystem::path file2(fileList.getFilePath(fileidx2));
std::string fn1(file1.filename().string());
std::string fn2(file2.filename().string());

// Parallel_3 injection. Will only run a pair if it exists in pair/ folder and is as assigned round
// NOTE: In AlignRetention.cpp, line 197: A run is limited by which round it is, needs to be in order
boost::filesystem::path file1(fileList.getFilePath(filePair.fileIdx1));
boost::filesystem::path file2(fileList.getFilePath(filePair.fileIdx2));
// Check if the pair assigned exists in the pair folder
if ((boost::filesystem::exists("pair/file1/" + file1.filename().string()) &&
boost::filesystem::exists("pair/file2/" + file2.filename().string())) ||
loop_counter < parallel_3_ - 1) { // -1 because parallel_3_ is added +1 from actual depth
if (parallel_4_) {
featureAlignmentQueue.push_back(std::make_pair(round, filePair));
} else if (boost::filesystem::exists("pair/file1/" + fn1) &&
boost::filesystem::exists("pair/file2/" + fn2)) {
featureAlignmentQueue.push_back(std::make_pair(round, filePair));
addedFeaturesFile = getFeatureFN(featureOutFile, fileidx1, fileidx2);
if (featuresAdded[fileidx1]) allFeatures.at(fileidx1).sortByPrecMz();
if (featuresAdded[fileidx2]) allFeatures.at(fileidx2).sortByPrecMz();
} else if (loop_counter < parallel_3_ - 1 && // -1 because parallel_3_ is added +1 from actual depth
(boost::filesystem::exists("pair/file1/" + fn2) ||
boost::filesystem::exists("pair/file2/" + fn2))) {
std::string savedAddedFeaturesFile = getFeatureFN(featureOutFile, fileidx1, fileidx2);
size_t addedFts = allFeatures.at(fileidx2).loadFromFile(savedAddedFeaturesFile);
std::cerr << "Read in " << addedFts << " features from " << savedAddedFeaturesFile << std::endl;
featuresAdded[fileidx2] = true;
}
loop_counter++;
}
infile.close();

// Load state of alignRetention
alignRetention.LoadState();
std::cout << "Parallel 3/4: Loading completed" << std::endl;
Expand All @@ -561,11 +595,9 @@ int Quandenser::run() {
percolatorOutputFileBaseFN, percolatorArgs_,
alignPpmTol_, alignRTimeStdevTol_, linkPEPThreshold_,
linkPEPMbrSearchThreshold_, maxFeatureCandidates_);

// Note: You COULD parallelize this (I've tested it), but since the mapping is so fast, it is not worth it
// when running on a cluster, since you have to submit a job for each process.
// Note on previous note: If there are large files, this is also slow, so why not parallelize it too? :)
featureAlignment.matchFeatures(featureAlignmentQueue, fileList, alignRetention, allFeatures);

featureAlignment.matchFeatures(featureAlignmentQueue, fileList, alignRetention, allFeatures, addedFeaturesFile);

if (parallel_3_) { // parallel 3 runs dinosaur.
std::cout << "Parallel stop 3 reached" << std::endl;
return EXIT_SUCCESS;
Expand Down
11 changes: 6 additions & 5 deletions src/Quandenser.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@

#include "maracluster/src/Globals.h"
#include "maracluster/src/SpectrumFileList.h"
#include "maracluster/src/BinaryInterface.h"

#include "Globals.h"
#include "Option.h"
Expand Down Expand Up @@ -83,10 +82,12 @@ class Quandenser {
const std::vector<DinosaurFeatureList>& allFeatures,
std::string& clusterFilePath,
SpectrumToPrecursorMap& spectrumToPrecursorMap);

void loadFeatures(const std::string& featureOutFile,
std::vector<DinosaurFeatureList>& allFeatures);


std::string getFeatureFN(const boost::filesystem::path& featureOutFile,
size_t fileIdx);
std::string getFeatureFN(const boost::filesystem::path& featureOutFile,
size_t fileIdx1, size_t fileIdx2);

std::vector<std::string> maraclusterArgs_;
std::vector<std::string> percolatorArgs_;
};
Expand Down

0 comments on commit df79378

Please sign in to comment.