-
Notifications
You must be signed in to change notification settings - Fork 0
/
pathmapper.cpp
402 lines (365 loc) · 11.1 KB
/
pathmapper.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
/*
* Copyright (C) 2015 by Glenn Hickey (hickey@soe.ucsc.edu)
*
* Released under the MIT license, see LICENSE.cactus
*/
#include <vector>
#include <sstream>
#include <cassert>
#include <algorithm>
#include <stack>
#include "pathmapper.h"
#include "pathspanner.h"
using namespace std;
using namespace vg;
PathMapper::PathMapper() : _sg(0), _lookup(0), _vg(0)
{
}
PathMapper::~PathMapper()
{
delete _sg;
delete _lookup;
}
void PathMapper::init(const VGLight* vg)
{
assert(vg != NULL);
_vg = vg;
delete _sg;
_sg = new SideGraph();
_seqStrings.clear();
_sgPaths.clear();
_sgSeqToVGPathID.clear();
_spanningPaths.clear();
delete _lookup;
_lookup = new SGLookup();
_nodeIDMap.clear();
// lookup structure uses string names (relic from hal2sg sequences)
// here we are mapping node coordinates, so just use nodeId
// (note these strings aren't really used for much)
vector<string> nodeNames;
const VGLight::NodeSet& nodeSet = _vg->getNodeSet();
for (VGLight::NodeSet::const_iterator i = nodeSet.begin();
i != nodeSet.end(); ++i)
{
stringstream ss;
ss << (*i)->id();
nodeNames.push_back(ss.str());
// make sure we can index our nodes with some number <= numNodes
_nodeIDMap.insert(pair<int64_t, sg_int_t>((*i)->id(), _nodeIDMap.size()));
}
_lookup->init(nodeNames);
// keep all vg paths indexed by name and id
_pathNames.clear();
_pathIDs.clear();
}
string PathMapper::getSideGraphDNA(sg_int_t seqID, sg_int_t offset,
sg_int_t length, bool reversed) const
{
assert(seqID >= 0 && seqID < _seqStrings.size());
if (length == -1)
{
length = _seqStrings[seqID].length();
}
string dna = _seqStrings[seqID].substr(offset, length);
if (reversed)
{
VGLight::reverseComplement(dna);
}
return dna;
}
string PathMapper::getSideGraphPathDNA(const string& pathName) const
{
string outString;
const vector<SGSegment>& path = getSideGraphPath(pathName);
size_t pathLen = 0;
(void)pathLen;
for (size_t i = 0; i < path.size(); ++i)
{
outString += getSideGraphDNA(path[i].getSide().getBase().getSeqID(),
path[i].getMinPos().getPos(),
path[i].getLength(),
!path[i].getSide().getForward());
pathLen += path[i].getLength();
}
assert(pathLen == outString.length());
return outString;
}
const vector<SGSegment>& PathMapper::getSideGraphPath(const string& pathName)
const
{
return _sgPaths[getPathID(pathName)];
}
const string& PathMapper::getVGPathName(const SGSequence* seq) const
{
return _pathNames[_sgSeqToVGPathID[seq->getID()]];
}
void PathMapper::addPath(const std::string& pathName,
const VGLight::MappingList& mappings)
{
assert(_pathIDs.find(pathName) == _pathIDs.end());
_pathNames.push_back(pathName);
_pathIDs.insert(pair<string, sg_int_t>(pathName, _pathIDs.size()));
_curSeq = NULL;
sg_int_t pathID = getPathID(pathName);
sg_int_t pathPos = 0;
size_t mappingCount = 0;
for (VGLight::MappingList::const_iterator i = mappings.begin();
i != mappings.end(); ++i, ++mappingCount)
{
Position pos = i->position();
bool reversed = pos.is_reverse();
sg_int_t segmentLength = _vg->getSegmentLength(*i);
// we never want to only convert a partial node. this is
// enforced at the beginning and end of paths here:
// (assumption: offset always relative to forward position 0)
const Node* node = _vg->getNode(pos.node_id());
size_t nodeLen = node->sequence().length();
int64_t offset = pos.offset();
// convert to forward-offset, as that is what vg used when logic was written
if (reversed) {
offset = nodeLen - 1 - offset;
}
if (!reversed)
{
// clamp forward starting point to 0
if (i == mappings.begin() && offset > 0)
{
segmentLength += offset;
pos.set_offset(0);
}
// clamp forward end point to len-1
if (i == --mappings.end() && offset + segmentLength < nodeLen)
{
segmentLength += nodeLen - (offset + segmentLength);
}
}
else
{
// clamp reverse starting point to len-1
if (i == mappings.begin() && offset < nodeLen - 1)
{
segmentLength += nodeLen - 1 - offset;
pos.set_offset(0);
}
// clamp reverse ending point to 0
if (i == --mappings.end() && offset - segmentLength > 0)
{
segmentLength = offset;
}
}
addSegment(pathID, pathPos, pos, reversed, segmentLength);
pathPos += segmentLength;
}
if (_curSeq != NULL)
{
_sg->addSequence(_curSeq);
}
_curSeq = NULL;
addPathJoins(pathName, mappings);
}
void PathMapper::addSpanningPaths()
{
if (_spanningPaths.size() > 0)
{
throw runtime_error("Spanning paths already added");
}
if (_vg->getNodeSet().empty())
{
return;
}
PathSpanner ps;
ps.init(_vg);
while (ps.hasNextPath() == true)
{
string pathName = getSpanningPathName();
VGLight::MappingList mappings;
ps.getNextPath(mappings);
addPath(pathName, mappings);
_spanningPaths.insert(pair<sg_int_t, VGLight::MappingList>(
_pathNames.size()-1, mappings));
}
}
void PathMapper::verifyPaths() const
{
for (int i = 0; i < _pathNames.size(); ++i)
{
string sgDNA = getSideGraphPathDNA(_pathNames[i]);
string vgDNA;
if (!isSpanningPath(i))
{
_vg->getPathDNA(_pathNames[i], vgDNA);
}
else
{
_vg->getPathDNA(_spanningPaths.find(i)->second, vgDNA);
}
if (vgDNA != sgDNA)
{
stringstream ss;
ss << "Verification failed for VG path " << _pathNames[i] << ": "
<< "output DNA differs from input. Please report this bug";
throw runtime_error(ss.str());
}
}
}
void PathMapper::addSegment(sg_int_t pathID, sg_int_t pathPos,
const Position& pos, bool reversed,
sg_int_t segLength)
{
const Node* node = _vg->getNode(pos.node_id());
// convert vg offset to forward relative, like it used to be
int64_t offset = pos.offset();
if (pos.is_reverse()) {
offset = node->sequence().length() - 1 - offset;
}
// when mapping, our "from" coordinate is node-relative
sg_int_t sgNodeID = _nodeIDMap.find(pos.node_id())->second;
SGPosition sgPos(sgNodeID, offset);
SGSide mapResult = _lookup->mapPosition(sgPos);
bool found = mapResult.getBase() != SideGraph::NullPos;
if (!found)
{
// CASE 1) : Create new SG Sequence
if (_curSeq == NULL)
{
_curSeq = new SGSequence(_sg->getNumSequences(), 0,
makeSeqName(pathID, pathPos));
assert(_curSeq->getID() == _seqStrings.size());
_seqStrings.push_back("");
_sgSeqToVGPathID.push_back(_pathNames.size()-1);
}
// CASE 2) : Extend existing SG Sequence
sg_int_t curSeqLen = _curSeq->getLength();
assert(_curSeq->getID() < _seqStrings.size());
int64_t start = !reversed ? offset : offset - segLength + 1;
string dna = node->sequence().substr(start, segLength);
if (reversed == true)
{
VGLight::reverseComplement(dna);
}
assert(reversed || offset + segLength <= dna.length());
assert(!reversed || offset - segLength + 1 >= 0);
// add dna string to the sequence
_seqStrings[_curSeq->getID()].append(dna);
_curSeq->setLength(_curSeq->getLength() + segLength);
assert(_curSeq->getLength() == _seqStrings[_curSeq->getID()].length());
// update lookup
SGPosition toPos(_curSeq->getID(), curSeqLen);
SGPosition fromPos(sgPos.getSeqID(), !reversed ? sgPos.getPos() :
sgPos.getPos() - segLength + 1);
_lookup->addInterval(fromPos, toPos, segLength, reversed);
}
else
{
// CASE 3) : Segment maps to existing SG Sequence
if (_curSeq != NULL)
{
_sg->addSequence(_curSeq);
}
_curSeq = NULL;
}
}
// doesn't really need a second pass but whatever
void PathMapper::addPathJoins(const string& name,
const VGLight::MappingList& mappings)
{
_sgPaths.push_back(vector<SGSegment>());
vector<SGSegment>& sgPath = _sgPaths.back();
int mappingCount = 0;
for (VGLight::MappingList::const_iterator i = mappings.begin();
i != mappings.end(); ++i, ++mappingCount)
{
const Position& pos = i->position();
bool reversed = pos.is_reverse();
const Node* node = _vg->getNode(pos.node_id());
int64_t segmentLength = _vg->getSegmentLength(*i);
int64_t offset = pos.offset();
// convert vg offset to forward relative, like it used to be
if (reversed) {
offset = node->sequence().length() - 1 - offset;
}
assert(offset >= 0);
// do some sanity checks on startpoints
if (i != mappings.begin() &&
((!reversed && offset > 0) ||
(reversed && offset != node->sequence().length() - 1)))
{
stringstream ss;
ss << "Path " << name << " Mapping rank " << (mappingCount + 1) << ": ";
if (reversed)
{
ss << "(Reverse) ";
}
ss << "Mapping with offset " << offset
<< " does not start at node " << node->id() << " endpoint.";
throw runtime_error(ss.str());
}
// and endpoints
if (i != --mappings.end() &&
((!reversed && offset + segmentLength != node->sequence().length())
|| (reversed && offset - segmentLength + 1 != 0)))
{
stringstream ss;
ss << "Path " << name << " Mapping rank " << (mappingCount + 1) << ": ";
if (reversed)
{
ss << "(Reverse) ";
}
ss << "Mapping with offset " << offset << " and length " << segmentLength
<< " does not end at endpoint of node " << node->id() << " with length "
<< node->sequence().length();
throw runtime_error(ss.str());
}
sg_int_t nodeID = _nodeIDMap.find(node->id())->second;
SGPosition start(nodeID, offset);
vector<SGSegment> nextPath;
_lookup->getPath(start, segmentLength, !reversed, nextPath);
mergePaths(sgPath, nextPath);
}
for (size_t i = 1; i < sgPath.size(); ++i)
{
SGSide srcSide = sgPath[i-1].getOutSide();
SGSide tgtSide = sgPath[i].getInSide();
SGJoin* join = new SGJoin(srcSide, tgtSide);
if (!join->isTrivial())
{
_sg->addJoin(join);
}
else
{
delete join;
}
}
}
void PathMapper::mergePaths(vector<SGSegment>& prevPath,
const vector<SGSegment>& path) const
{
assert(path.size() > 0);
const SGSegment& nextSeg = path[0];
int i = 0;
if (!prevPath.empty() &&
prevPath.back().getSide().getForward() ==
nextSeg.getSide().getForward() &&
prevPath.back().getOutSide().lengthTo(nextSeg.getInSide()) == 0)
{
prevPath.back().setLength(prevPath.back().getLength() +
nextSeg.getLength());
++i;
}
for (; i < path.size(); ++i)
{
prevPath.push_back(path[i]);
}
}
string PathMapper::makeSeqName(sg_int_t pathID, sg_int_t pathPos)
{
stringstream ss;
ss << getPathName(pathID) << "_" << pathPos;
return ss.str();
}
string PathMapper::getSpanningPathName() const
{
stringstream ss;
ss << "___span_" << _spanningPaths.size();
return ss.str();
}