-
Notifications
You must be signed in to change notification settings - Fork 2
/
BamIndex.cpp
2034 lines (1635 loc) · 73.7 KB
/
BamIndex.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// ***************************************************************************
// BamIndex.cpp (c) 2009 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
// Last modified: 21 October 2010 (DB)
// ---------------------------------------------------------------------------
// Provides index functionality - both for the default (standardized) BAM
// index format (.bai) as well as a BamTools-specific (nonstandard) index
// format (.bti).
// ***************************************************************************
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <map>
#include "BamIndex.h"
#include "BamReader.h"
#include "BGZF.h"
using namespace std;
using namespace BamTools;
// -------------------------------
// BamIndex implementation
// ctor
BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader)
: m_BGZF(bgzf)
, m_reader(reader)
, m_cacheMode(BamIndex::LimitedIndexCaching)
, m_indexStream(0)
{
if ( m_reader && m_reader->IsOpen() )
m_references = m_reader->GetReferenceData();
}
// dtor
BamIndex::~BamIndex(void) {
if ( IsOpen() )
fclose(m_indexStream);
}
// return true if FILE* is open
bool BamIndex::IsOpen(void) const {
return ( m_indexStream != 0 );
}
// loads existing data from file into memory
bool BamIndex::Load(const string& filename) {
// open index file, abort on error
if ( !OpenIndexFile(filename, "rb") ) {
fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
return false;
}
// check magic number
if ( !LoadHeader() ) {
fclose(m_indexStream);
return false;
}
// load reference data (but only keep in memory if full caching requested)
bool saveInitialLoad = ( m_cacheMode == BamIndex::FullIndexCaching );
if ( !LoadAllReferences(saveInitialLoad) ) {
fclose(m_indexStream);
return false;
}
// update index cache based on selected mode
UpdateCache();
// return success
return true;
}
// opens index file for reading/writing, return true if opened OK
bool BamIndex::OpenIndexFile(const string& filename, const string& mode) {
m_indexStream = fopen(filename.c_str(), mode.c_str());
return ( m_indexStream != 0 );
}
// rewind index file to beginning of index data, return true if rewound OK
bool BamIndex::Rewind(void) {
return ( fseek64(m_indexStream, DataBeginOffset(), SEEK_SET) == 0 );
}
// change the index caching behavior
void BamIndex::SetCacheMode(const BamIndexCacheMode mode) {
if ( mode != m_cacheMode ) {
m_cacheMode = mode;
UpdateCache();
}
}
// updates in-memory cache of index data, depending on current cache mode
void BamIndex::UpdateCache(void) {
// skip if file not open
if ( !IsOpen() ) return;
// reflect requested cache mode behavior
switch ( m_cacheMode ) {
case (BamIndex::FullIndexCaching) :
Rewind();
LoadAllReferences(true);
break;
case (BamIndex::LimitedIndexCaching) :
if ( HasFullDataCache() )
KeepOnlyFirstReferenceOffsets();
else {
ClearAllData();
SkipToFirstReference();
LoadFirstReference(true);
}
break;
case(BamIndex::NoIndexCaching) :
ClearAllData();
break;
default :
// unreachable
;
}
}
// writes in-memory index data out to file
bool BamIndex::Write(const string& bamFilename) {
// open index file for writing
string indexFilename = bamFilename + Extension();
if ( !OpenIndexFile(indexFilename, "wb") ) {
fprintf(stderr, "ERROR: Could not open file to save index.\n");
return false;
}
// write index header data
if ( !WriteHeader() ) {
fprintf(stderr, "ERROR: There was a problem writing index metadata to new index file.\n");
fflush(m_indexStream);
fclose(m_indexStream);
exit(1);
}
// write main index data
if ( !WriteAllReferences() ) {
fprintf(stderr, "ERROR: There was a problem writing index data to new index file.\n");
fflush(m_indexStream);
fclose(m_indexStream);
exit(1);
}
// flush any remaining output, rewind file, and return success
fflush(m_indexStream);
fclose(m_indexStream);
// re-open index file for later reading
if ( !OpenIndexFile(indexFilename, "rb") ) {
fprintf(stderr, "ERROR: Could not open newly created index file for reading.\n");
return false;
}
// return success/failure of write
return true;
}
// #########################################################################################
// #########################################################################################
// -------------------------------
// BamStandardIndex structs & typedefs
namespace BamTools {
// BAM index constants
const int MAX_BIN = 37450; // =(8^6-1)/7+1
const int BAM_LIDX_SHIFT = 14;
// --------------------------------------------------
// BamStandardIndex data structures & typedefs
struct Chunk {
// data members
uint64_t Start;
uint64_t Stop;
// constructor
Chunk(const uint64_t& start = 0,
const uint64_t& stop = 0)
: Start(start)
, Stop(stop)
{ }
};
bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
return lhs.Start < rhs.Start;
}
typedef vector<Chunk> ChunkVector;
typedef map<uint32_t, ChunkVector> BamBinMap;
typedef vector<uint64_t> LinearOffsetVector;
struct ReferenceIndex {
// data members
BamBinMap Bins;
LinearOffsetVector Offsets;
bool HasAlignments;
// constructor
ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
const LinearOffsetVector& offsets = LinearOffsetVector(),
const bool hasAlignments = false)
: Bins(binMap)
, Offsets(offsets)
, HasAlignments(hasAlignments)
{ }
};
typedef map<int32_t, ReferenceIndex> BamStandardIndexData;
} // namespace BamTools
// -------------------------------
// BamStandardIndexPrivate implementation
struct BamStandardIndex::BamStandardIndexPrivate {
// parent object
BamStandardIndex* m_parent;
// data members
BamStandardIndexData m_indexData;
off_t m_dataBeginOffset;
bool m_hasFullDataCache;
bool m_isBigEndian;
// ctor & dtor
BamStandardIndexPrivate(BamStandardIndex* parent);
~BamStandardIndexPrivate(void);
// parent interface methods
public:
// creates index data (in-memory) from current reader data
bool Build(void);
// clear all current index offset data in memory
void ClearAllData(void);
// return file position after header metadata
const off_t DataBeginOffset(void) const;
// returns whether reference has alignments or no
bool HasAlignments(const int& referenceID) const;
// return true if all index data is cached
bool HasFullDataCache(void) const;
// attempts to use index to jump to region; returns success/fail
bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
// clears index data from all references except the first reference
void KeepOnlyFirstReferenceOffsets(void);
// load index data for all references, return true if loaded OK
// @saveData - save data in memory if true, just read & discard if false
bool LoadAllReferences(bool saveData = true);
// load first reference from file, return true if loaded OK
// @saveData - save data in memory if true, just read & discard if false
bool LoadFirstReference(bool saveData = true);
// load header data from index file, return true if loaded OK
bool LoadHeader(void);
// position file pointer to first reference begin, return true if skipped OK
bool SkipToFirstReference(void);
// write header to new index file
bool WriteHeader(void);
// write index data for all references to new index file
bool WriteAllReferences(void);
// internal methods
private:
// -----------------------
// index file operations
// check index file magic number, return true if OK
bool CheckMagicNumber(void);
// check index file version, return true if OK
bool CheckVersion(void);
// load a single index bin entry from file, return true if loaded OK
// @saveData - save data in memory if true, just read & discard if false
bool LoadBin(ReferenceIndex& refEntry, bool saveData = true);
bool LoadBins(ReferenceIndex& refEntry, bool saveData = true);
// load a single index bin entry from file, return true if loaded OK
// @saveData - save data in memory if true, just read & discard if false
bool LoadChunk(ChunkVector& chunks, bool saveData = true);
bool LoadChunks(ChunkVector& chunks, bool saveData = true);
// load a single index linear offset entry from file, return true if loaded OK
// @saveData - save data in memory if true, just read & discard if false
bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true);
// load a single reference from file, return true if loaded OK
// @saveData - save data in memory if true, just read & discard if false
bool LoadReference(const int& refId, bool saveData = true);
// loads number of references, return true if loaded OK
bool LoadReferenceCount(int& numReferences);
// position file pointer to desired reference begin, return true if skipped OK
bool SkipToReference(const int& refId);
// write index data for bin to new index file
bool WriteBin(const uint32_t& binId, const ChunkVector& chunks);
// write index data for bins to new index file
bool WriteBins(const BamBinMap& bins);
// write index data for chunk entry to new index file
bool WriteChunk(const Chunk& chunk);
// write index data for chunk entry to new index file
bool WriteChunks(const ChunkVector& chunks);
// write index data for linear offsets entry to new index file
bool WriteLinearOffsets(const LinearOffsetVector& offsets);
// write index data single reference to new index file
bool WriteReference(const ReferenceIndex& refEntry);
// -----------------------
// index data operations
// calculate bins that overlap region
int BinsFromRegion(const BamRegion& region,
const bool isRightBoundSpecified,
uint16_t bins[BamTools::MAX_BIN]);
// clear all index offset data for desired reference
void ClearReferenceOffsets(const int& refId);
// calculates offset(s) for a given region
bool GetOffsets(const BamRegion& region,
const bool isRightBoundSpecified,
vector<int64_t>& offsets,
bool* hasAlignmentsInRegion);
// returns true if index cache has data for desired reference
bool IsDataLoaded(const int& refId) const;
// clears index data from all references except the one specified
void KeepOnlyReferenceOffsets(const int& refId);
// simplifies index by merging 'chunks'
void MergeChunks(void);
// saves BAM bin entry for index
void SaveBinEntry(BamBinMap& binMap,
const uint32_t& saveBin,
const uint64_t& saveOffset,
const uint64_t& lastOffset);
// saves linear offset entry for index
void SaveLinearOffset(LinearOffsetVector& offsets,
const BamAlignment& bAlignment,
const uint64_t& lastOffset);
// initializes index data structure to hold @count references
void SetReferenceCount(const int& count);
};
BamStandardIndex::BamStandardIndexPrivate::BamStandardIndexPrivate(BamStandardIndex* parent)
: m_parent(parent)
, m_dataBeginOffset(0)
, m_hasFullDataCache(false)
{
m_isBigEndian = BamTools::SystemIsBigEndian();
}
BamStandardIndex::BamStandardIndexPrivate::~BamStandardIndexPrivate(void) {
ClearAllData();
}
// calculate bins that overlap region
int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region,
const bool isRightBoundSpecified,
uint16_t bins[MAX_BIN])
{
// get region boundaries
uint32_t begin = (unsigned int)region.LeftPosition;
uint32_t end;
// if right bound specified AND left&right bounds are on same reference
// OK to use right bound position
if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
end = (unsigned int)region.RightPosition;
// otherwise, use end of left bound reference as cutoff
else
end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1;
// initialize list, bin '0' always a valid bin
int i = 0;
bins[i++] = 0;
// get rest of bins that contain this region
unsigned int k;
for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; }
for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; }
for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; }
for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; }
for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
// return number of bins stored
return i;
}
// creates index data (in-memory) from current reader data
bool BamStandardIndex::BamStandardIndexPrivate::Build(void) {
// localize parent data
if ( m_parent == 0 ) return false;
BamReader* reader = m_parent->m_reader;
BgzfData* mBGZF = m_parent->m_BGZF;
// be sure reader & BGZF file are valid & open for reading
if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
return false;
// move file pointer to beginning of alignments
reader->Rewind();
// get reference count, reserve index space
const int numReferences = (int)m_parent->m_references.size();
m_indexData.clear();
m_hasFullDataCache = false;
SetReferenceCount(numReferences);
// sets default constant for bin, ID, offset, coordinate variables
const uint32_t defaultValue = 0xffffffffu;
// bin data
uint32_t saveBin(defaultValue);
uint32_t lastBin(defaultValue);
// reference ID data
int32_t saveRefID(defaultValue);
int32_t lastRefID(defaultValue);
// offset data
uint64_t saveOffset = mBGZF->Tell();
uint64_t lastOffset = saveOffset;
// coordinate data
int32_t lastCoordinate = defaultValue;
BamAlignment bAlignment;
while ( reader->GetNextAlignmentCore(bAlignment) ) {
// change of chromosome, save ID, reset bin
if ( lastRefID != bAlignment.RefID ) {
lastRefID = bAlignment.RefID;
lastBin = defaultValue;
}
// if lastCoordinate greater than BAM position - file not sorted properly
else if ( lastCoordinate > bAlignment.Position ) {
fprintf(stderr, "BAM file not properly sorted:\n");
fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(),
lastCoordinate, bAlignment.Position, bAlignment.RefID);
exit(1);
}
// if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
// save linear offset entry (matched to BAM entry refID)
BamStandardIndexData::iterator indexIter = m_indexData.find(bAlignment.RefID);
if ( indexIter == m_indexData.end() ) return false; // error
ReferenceIndex& refIndex = (*indexIter).second;
LinearOffsetVector& offsets = refIndex.Offsets;
SaveLinearOffset(offsets, bAlignment, lastOffset);
}
// if current BamAlignment bin != lastBin, "then possibly write the binning index"
if ( bAlignment.Bin != lastBin ) {
// if not first time through
if ( saveBin != defaultValue ) {
// save Bam bin entry
BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID);
if ( indexIter == m_indexData.end() ) return false; // error
ReferenceIndex& refIndex = (*indexIter).second;
BamBinMap& binMap = refIndex.Bins;
SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
}
// update saveOffset
saveOffset = lastOffset;
// update bin values
saveBin = bAlignment.Bin;
lastBin = bAlignment.Bin;
// update saveRefID
saveRefID = bAlignment.RefID;
// if invalid RefID, break out
if ( saveRefID < 0 ) break;
}
// make sure that current file pointer is beyond lastOffset
if ( mBGZF->Tell() <= (int64_t)lastOffset ) {
fprintf(stderr, "Error in BGZF offsets.\n");
exit(1);
}
// update lastOffset
lastOffset = mBGZF->Tell();
// update lastCoordinate
lastCoordinate = bAlignment.Position;
}
// save any leftover BAM data (as long as refID is valid)
if ( saveRefID >= 0 ) {
// save Bam bin entry
BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID);
if ( indexIter == m_indexData.end() ) return false; // error
ReferenceIndex& refIndex = (*indexIter).second;
BamBinMap& binMap = refIndex.Bins;
SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
}
// simplify index by merging chunks
MergeChunks();
// iterate through references in index
// sort offsets in linear offset vector
BamStandardIndexData::iterator indexIter = m_indexData.begin();
BamStandardIndexData::iterator indexEnd = m_indexData.end();
for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
// get reference index data
ReferenceIndex& refIndex = (*indexIter).second;
LinearOffsetVector& offsets = refIndex.Offsets;
// sort linear offsets
sort(offsets.begin(), offsets.end());
}
// rewind file pointer to beginning of alignments, return success/fail
return reader->Rewind();
}
// check index file magic number, return true if OK
bool BamStandardIndex::BamStandardIndexPrivate::CheckMagicNumber(void) {
// read in magic number
char magic[4];
size_t elementsRead = fread(magic, sizeof(char), 4, m_parent->m_indexStream);
// compare to expected value
if ( strncmp(magic, "BAI\1", 4) != 0 ) {
fprintf(stderr, "Problem with index file - invalid format.\n");
fclose(m_parent->m_indexStream);
return false;
}
// return success/failure of load
return (elementsRead == 4);
}
// clear all current index offset data in memory
void BamStandardIndex::BamStandardIndexPrivate::ClearAllData(void) {
BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
BamStandardIndexData::const_iterator indexEnd = m_indexData.end();
for ( ; indexIter != indexEnd; ++indexIter ) {
const int& refId = (*indexIter).first;
ClearReferenceOffsets(refId);
}
}
// clear all index offset data for desired reference
void BamStandardIndex::BamStandardIndexPrivate::ClearReferenceOffsets(const int& refId) {
// look up refId, skip if not found
BamStandardIndexData::iterator indexIter = m_indexData.find(refId);
if ( indexIter == m_indexData.end() ) return ;
// clear reference data
ReferenceIndex& refEntry = (*indexIter).second;
refEntry.Bins.clear();
refEntry.Offsets.clear();
// set flag
m_hasFullDataCache = false;
}
// return file position after header metadata
const off_t BamStandardIndex::BamStandardIndexPrivate::DataBeginOffset(void) const {
return m_dataBeginOffset;
}
// calculates offset(s) for a given region
bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region,
const bool isRightBoundSpecified,
vector<int64_t>& offsets,
bool* hasAlignmentsInRegion)
{
// return false if leftBound refID is not found in index data
if ( m_indexData.find(region.LeftRefID) == m_indexData.end() )
return false;
// load index data for region if not already cached
if ( !IsDataLoaded(region.LeftRefID) ) {
bool loadedOk = true;
loadedOk &= SkipToReference(region.LeftRefID);
loadedOk &= LoadReference(region.LeftRefID);
if ( !loadedOk ) return false;
}
// calculate which bins overlap this region
uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
int numBins = BinsFromRegion(region, isRightBoundSpecified, bins);
// get bins for this reference
BamStandardIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID);
if ( indexIter == m_indexData.end() ) return false; // error
const ReferenceIndex& refIndex = (*indexIter).second;
const BamBinMap& binMap = refIndex.Bins;
// get minimum offset to consider
const LinearOffsetVector& linearOffsets = refIndex.Offsets;
const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() )
? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
// store all alignment 'chunk' starts (file offsets) for bins in this region
for ( int i = 0; i < numBins; ++i ) {
const uint16_t binKey = bins[i];
map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
// iterate over chunks
const ChunkVector& chunks = (*binIter).second;
std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
std::vector<Chunk>::const_iterator chunksEnd = chunks.end();
for ( ; chunksIter != chunksEnd; ++chunksIter) {
// if valid chunk found, store its file offset
const Chunk& chunk = (*chunksIter);
if ( chunk.Stop > minOffset )
offsets.push_back( chunk.Start );
}
}
}
// clean up memory
free(bins);
// sort the offsets before returning
sort(offsets.begin(), offsets.end());
// set flag & return success
*hasAlignmentsInRegion = (offsets.size() != 0 );
// if cache mode set to none, dump the data we just loaded
if (m_parent->m_cacheMode == BamIndex::NoIndexCaching )
ClearReferenceOffsets(region.LeftRefID);
// return succes
return true;
}
// returns whether reference has alignments or no
bool BamStandardIndex::BamStandardIndexPrivate::HasAlignments(const int& refId) const {
BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId);
if ( indexIter == m_indexData.end() ) return false; // error
const ReferenceIndex& refEntry = (*indexIter).second;
return refEntry.HasAlignments;
}
// return true if all index data is cached
bool BamStandardIndex::BamStandardIndexPrivate::HasFullDataCache(void) const {
return m_hasFullDataCache;
}
// returns true if index cache has data for desired reference
bool BamStandardIndex::BamStandardIndexPrivate::IsDataLoaded(const int& refId) const {
// look up refId, return false if not found
BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId);
if ( indexIter == m_indexData.end() ) return false;
// see if reference has alignments
// if not, it's not a problem to have no offset data
const ReferenceIndex& refEntry = (*indexIter).second;
if ( !refEntry.HasAlignments ) return true;
// return whether bin map contains data
return ( !refEntry.Bins.empty() );
}
// attempts to use index to jump to region; returns success/fail
bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
// localize parent data
if ( m_parent == 0 ) return false;
BamReader* reader = m_parent->m_reader;
BgzfData* mBGZF = m_parent->m_BGZF;
RefVector& references = m_parent->m_references;
// be sure reader & BGZF file are valid & open for reading
if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
return false;
// make sure left-bound position is valid
if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
return false;
// calculate offsets for this region
// if failed, print message, set flag, and return failure
vector<int64_t> offsets;
if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) {
fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n");
*hasAlignmentsInRegion = false;
return false;
}
// iterate through offsets
BamAlignment bAlignment;
bool result = true;
for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
// attempt seek & load first available alignment
// set flag to true if data exists
result &= mBGZF->Seek(*o);
*hasAlignmentsInRegion = reader->GetNextAlignmentCore(bAlignment);
// if this alignment corresponds to desired position
// return success of seeking back to the offset before the 'current offset' (to cover overlaps)
if ( ((bAlignment.RefID == region.LeftRefID) &&
((bAlignment.Position + bAlignment.Length) > region.LeftPosition)) ||
(bAlignment.RefID > region.LeftRefID) )
{
if ( o != offsets.begin() ) --o;
return mBGZF->Seek(*o);
}
}
// if error in jumping, print message & set flag
if ( !result ) {
fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n");
*hasAlignmentsInRegion = false;
}
// return success/failure
return result;
}
// clears index data from all references except the first
void BamStandardIndex::BamStandardIndexPrivate::KeepOnlyFirstReferenceOffsets(void) {
BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
KeepOnlyReferenceOffsets((*indexBegin).first);
}
// clears index data from all references except the one specified
void BamStandardIndex::BamStandardIndexPrivate::KeepOnlyReferenceOffsets(const int& refId) {
BamStandardIndexData::iterator mapIter = m_indexData.begin();
BamStandardIndexData::iterator mapEnd = m_indexData.end();
for ( ; mapIter != mapEnd; ++mapIter ) {
const int entryRefId = (*mapIter).first;
if ( entryRefId != refId )
ClearReferenceOffsets(entryRefId);
}
}
bool BamStandardIndex::BamStandardIndexPrivate::LoadAllReferences(bool saveData) {
// skip if data already loaded
if ( m_hasFullDataCache ) return true;
// get number of reference sequences
uint32_t numReferences;
if ( !LoadReferenceCount((int&)numReferences) )
return false;
// iterate over reference entries
bool loadedOk = true;
for ( int i = 0; i < (int)numReferences; ++i )
loadedOk &= LoadReference(i, saveData);
// set flag
if ( loadedOk && saveData )
m_hasFullDataCache = true;
// return success/failure of loading references
return loadedOk;
}
// load header data from index file, return true if loaded OK
bool BamStandardIndex::BamStandardIndexPrivate::LoadHeader(void) {
bool loadedOk = CheckMagicNumber();
// store offset of beginning of data
m_dataBeginOffset = ftell64(m_parent->m_indexStream);
// return success/failure of load
return loadedOk;
}
// load a single index bin entry from file, return true if loaded OK
// @saveData - save data in memory if true, just read & discard if false
bool BamStandardIndex::BamStandardIndexPrivate::LoadBin(ReferenceIndex& refEntry, bool saveData) {
size_t elementsRead = 0;
// get bin ID
uint32_t binId;
elementsRead += fread(&binId, sizeof(binId), 1, m_parent->m_indexStream);
if ( m_isBigEndian ) SwapEndian_32(binId);
// load alignment chunks for this bin
ChunkVector chunks;
bool chunksOk = LoadChunks(chunks, saveData);
// store bin entry
if ( chunksOk && saveData )
refEntry.Bins.insert(pair<uint32_t, ChunkVector>(binId, chunks));
// return success/failure of load
return ( (elementsRead == 1) && chunksOk );
}
bool BamStandardIndex::BamStandardIndexPrivate::LoadBins(ReferenceIndex& refEntry, bool saveData) {
size_t elementsRead = 0;
// get number of bins
int32_t numBins;
elementsRead += fread(&numBins, sizeof(numBins), 1, m_parent->m_indexStream);
if ( m_isBigEndian ) SwapEndian_32(numBins);
// set flag
refEntry.HasAlignments = ( numBins != 0 );
// iterate over bins
bool binsOk = true;
for ( int i = 0; i < numBins; ++i )
binsOk &= LoadBin(refEntry, saveData);
// return success/failure of load
return ( (elementsRead == 1) && binsOk );
}
// load a single index bin entry from file, return true if loaded OK
// @saveData - save data in memory if true, just read & discard if false
bool BamStandardIndex::BamStandardIndexPrivate::LoadChunk(ChunkVector& chunks, bool saveData) {
size_t elementsRead = 0;
// read in chunk data
uint64_t start;
uint64_t stop;
elementsRead += fread(&start, sizeof(start), 1, m_parent->m_indexStream);
elementsRead += fread(&stop, sizeof(stop), 1, m_parent->m_indexStream);
// swap endian-ness if necessary
if ( m_isBigEndian ) {
SwapEndian_64(start);
SwapEndian_64(stop);
}
// save data if requested
if ( saveData ) chunks.push_back( Chunk(start, stop) );
// return success/failure of load
return ( elementsRead == 2 );
}
bool BamStandardIndex::BamStandardIndexPrivate::LoadChunks(ChunkVector& chunks, bool saveData) {
size_t elementsRead = 0;
// read in number of chunks
uint32_t numChunks;
elementsRead += fread(&numChunks, sizeof(numChunks), 1, m_parent->m_indexStream);
if ( m_isBigEndian ) SwapEndian_32(numChunks);
// initialize space for chunks if we're storing this data
if ( saveData ) chunks.reserve(numChunks);
// iterate over chunks
bool chunksOk = true;
for ( int i = 0; i < (int)numChunks; ++i )
chunksOk &= LoadChunk(chunks, saveData);
// sort chunk vector
sort( chunks.begin(), chunks.end(), ChunkLessThan );
// return success/failure of load
return ( (elementsRead == 1) && chunksOk );
}
// load a single index linear offset entry from file, return true if loaded OK
// @saveData - save data in memory if true, just read & discard if false
bool BamStandardIndex::BamStandardIndexPrivate::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) {
size_t elementsRead = 0;
// read in number of linear offsets
int32_t numLinearOffsets;
elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_parent->m_indexStream);
if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets);
// set up destination vector (if we're saving the data)
LinearOffsetVector linearOffsets;
if ( saveData ) linearOffsets.reserve(numLinearOffsets);
// iterate over linear offsets
uint64_t linearOffset;
for ( int i = 0; i < numLinearOffsets; ++i ) {
elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_parent->m_indexStream);
if ( m_isBigEndian ) SwapEndian_64(linearOffset);
if ( saveData ) linearOffsets.push_back(linearOffset);
}
// sort linear offsets
sort ( linearOffsets.begin(), linearOffsets.end() );
// save in reference index entry if desired
if ( saveData ) refEntry.Offsets = linearOffsets;
// return success/failure of load
return ( elementsRead == (size_t)(numLinearOffsets + 1) );
}
bool BamStandardIndex::BamStandardIndexPrivate::LoadFirstReference(bool saveData) {
BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
return LoadReference((*indexBegin).first, saveData);
}
// load a single reference from file, return true if loaded OK
// @saveData - save data in memory if true, just read & discard if false
bool BamStandardIndex::BamStandardIndexPrivate::LoadReference(const int& refId, bool saveData) {
// look up refId
BamStandardIndexData::iterator indexIter = m_indexData.find(refId);
// if reference not previously loaded, create new entry
if ( indexIter == m_indexData.end() ) {
ReferenceIndex newEntry;
newEntry.HasAlignments = false;
m_indexData.insert( pair<int32_t, ReferenceIndex>(refId, newEntry) );
}
// load reference data
indexIter = m_indexData.find(refId);
ReferenceIndex& entry = (*indexIter).second;
bool loadedOk = true;
loadedOk &= LoadBins(entry, saveData);
loadedOk &= LoadLinearOffsets(entry, saveData);
return loadedOk;
}
// loads number of references, return true if loaded OK
bool BamStandardIndex::BamStandardIndexPrivate::LoadReferenceCount(int& numReferences) {
size_t elementsRead = 0;
// read reference count
elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
if ( m_isBigEndian ) SwapEndian_32(numReferences);
// return success/failure of load
return ( elementsRead == 1 );
}
// merges 'alignment chunks' in BAM bin (used for index building)
void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) {
// iterate over reference enties
BamStandardIndexData::iterator indexIter = m_indexData.begin();
BamStandardIndexData::iterator indexEnd = m_indexData.end();
for ( ; indexIter != indexEnd; ++indexIter ) {
// get BAM bin map for this reference
ReferenceIndex& refIndex = (*indexIter).second;
BamBinMap& bamBinMap = refIndex.Bins;
// iterate over BAM bins
BamBinMap::iterator binIter = bamBinMap.begin();
BamBinMap::iterator binEnd = bamBinMap.end();
for ( ; binIter != binEnd; ++binIter ) {
// get chunk vector for this bin
ChunkVector& binChunks = (*binIter).second;
if ( binChunks.size() == 0 ) continue;
ChunkVector mergedChunks;
mergedChunks.push_back( binChunks[0] );
// iterate over chunks
int i = 0;
ChunkVector::iterator chunkIter = binChunks.begin();
ChunkVector::iterator chunkEnd = binChunks.end();
for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
// get 'currentChunk' based on numeric index
Chunk& currentChunk = mergedChunks[i];
// get iteratorChunk based on vector iterator
Chunk& iteratorChunk = (*chunkIter);
// if chunk ends where (iterator) chunk starts, then merge
if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 )
currentChunk.Stop = iteratorChunk.Stop;