diff --git a/.settings/language.settings.xml b/.settings/language.settings.xml
index 2102df2..1df044e 100644
--- a/.settings/language.settings.xml
+++ b/.settings/language.settings.xml
@@ -5,7 +5,7 @@
-
+
diff --git a/Makefile b/Makefile
index 58ffa9b..ea0cd60 100644
--- a/Makefile
+++ b/Makefile
@@ -28,7 +28,7 @@ src/tomahawk/two \
-include src/algorithm/subdir.mk
-include src/subdir.mk
-LD_LIB_FLAGS := -fPIC -Wl,-rpath,$(CWD),-soname,ltomahawk.so.1
+LD_LIB_FLAGS := -fPIC -Wl,-rpath,"./",-soname,ltomahawk.so.1
# Inject git information
BRANCH := $(shell git rev-parse --abbrev-ref HEAD)
@@ -55,7 +55,7 @@ all: tomahawk
tomahawk: $(OBJS) $(USER_OBJS)
g++ -pthread -o "tomahawk" $(OBJS) $(USER_OBJS) $(LIBS)
@echo 'Constructing shared library...'
- g++ $(LD_LIB_FLAGS) -pthread -o ltomahawk.so.$(LIBVER_SCRIPT) $(OBJS) $(USER_OBJS) $(LIBS)
+ g++ $(LD_LIB_FLAGS) -pthread -o ltomahawk.so.$(LIBVER) $(OBJS) $(USER_OBJS) $(LIBS)
@echo 'Symlinking library...'
ln -sf ltomahawk.so.$(LIBVER) ltomahawk.so
diff --git a/README.md b/README.md
index b452243..5537169 100644
--- a/README.md
+++ b/README.md
@@ -325,9 +325,9 @@ Department of Genetics, University of Cambridge
Wellcome Trust Sanger Institute
### Acknowledgements
-[Professor John A Todd](https://www.ndm.ox.ac.uk/principal-investigators/researcher/john-todd), Nuffield Department of Medicine, University of Oxford
+[John A Todd](https://www.ndm.ox.ac.uk/principal-investigators/researcher/john-todd), Nuffield Department of Medicine, University of Oxford
[Chris Wallace](https://github.com/chr1swallace), MRC Biostatistics Unit, University of Cambridge
-[Professor Richard Durbin](https://github.com/richarddurbin), Wellcome Trust Sanger Institute, and Department of Genetics, University of Cambridge
+[Richard Durbin](https://github.com/richarddurbin), Wellcome Trust Sanger Institute, and Department of Genetics, University of Cambridge
### License
[MIT](LICENSE)
diff --git a/RELEASE.md b/RELEASE.md
index b35ae4c..9949663 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,24 @@
+# Release 0.5.2
+
+## Minor Features And Improvements
+* Now correctly filters `two` files
+
+# Release 0.5.1
+
+## Minor Features And Improvements
+* Meta index for faster queries of `two` files now builds correctly
+* Interval slicing for sorted `two` files now outputs filtering hits for the from positions only
+* Haplotype counts in binary `two` files are now stored as `double` instead of `float`
+
+# Release 0.5.0
+
+## Breaking Changes
+* All changes are breaking because of structural rechanges. No files generated prior to this release are supported.
+
+## Minor Features And Improvements
+* Build process now smoother
+* Shared C++ library is now built by default
+
# Release 0.4.0
## Breaking Changes
diff --git a/src/algorithm/load_balancer_block.h b/src/algorithm/load_balancer_block.h
index 44c0f52..4d10302 100644
--- a/src/algorithm/load_balancer_block.h
+++ b/src/algorithm/load_balancer_block.h
@@ -23,6 +23,14 @@ struct LoadBalancerBlock{
staggered(false)
{}
+ LoadBalancerBlock(const U32 fromRow, const U32 toRow, const U32 fromColumn, const U32 toColumn, bool staggered) :
+ fromRow(fromRow),
+ toRow(toRow),
+ fromColumn(fromColumn),
+ toColumn(toColumn),
+ staggered(staggered)
+ {}
+
~LoadBalancerBlock(){}
inline U32 getRows(void) const{ return(this->toRow - this->fromRow); }
@@ -47,15 +55,13 @@ struct LoadBalancerBlock{
}
friend std::ostream& operator<<(std::ostream& os, const LoadBalancerBlock& block){
- os << block.fromRow << '-' << block.toRow << '\t' << block.fromColumn << '-' << block.toColumn << (int)block.staggered;
+ os << "[" << block.fromRow << '-' << block.toRow << ", " << block.fromColumn << '-' << block.toColumn << "] staggered: " << (int)block.staggered;
return(os);
}
// Relative order
- U32 fromRow;
- U32 toRow;
- U32 fromColumn;
- U32 toColumn;
+ U32 fromRow, toRow;
+ U32 fromColumn, toColumn;
bool staggered;
};
diff --git a/src/algorithm/load_balancer_ld.cpp b/src/algorithm/load_balancer_ld.cpp
index cca18fd..aaf9d8a 100644
--- a/src/algorithm/load_balancer_ld.cpp
+++ b/src/algorithm/load_balancer_ld.cpp
@@ -2,37 +2,281 @@
namespace tomahawk{
-LoadBalancerLD::LoadBalancerLD() : selected_chunk(0), n_desired_chunks(1){}
+LoadBalancerLD::LoadBalancerLD() :
+ selected_chunk(0),
+ n_desired_chunks(1),
+ n_comparisons_chunk(0)
+{}
+
LoadBalancerLD::~LoadBalancerLD(){}
bool LoadBalancerLD::getSelectedLoad(){
- value_type selected = this->blocks[this->selected_chunk];
+ const value_type& selected = this->blocks[this->selected_chunk];
// If there are both equal
if(selected.fromRow == selected.fromColumn && selected.toRow == selected.toColumn){
this->data_to_load.push_back(std::pair(selected.fromRow, selected.toRow));
} else {
// No overlap
- this->data_to_load.push_back(std::pair(selected.fromRow, selected.toRow));
+ this->data_to_load.push_back(std::pair(selected.fromRow, selected.toRow));
this->data_to_load.push_back(std::pair(selected.fromColumn, selected.toColumn));
}
return true;
}
+bool LoadBalancerLD::getSelectedLoadInterval(const reader_type& reader, const std::vector< std::pair >& intervals){
+ if(intervals.size() == 0) return false;
+ // Transform absolute offsets into relative offsets
+ // and store the absolute block offsets
+ // e.g.
+ // 1: 0->6
+ // 2: 265->450
+ // 3: 530->559
+ // Cumulative distribution = [0,6,191,220]
+ // Resulting relative offsets
+ // 1: 0->6
+ // 2: 6->191
+ // 3: 191->220
+ //
+ // Therefore: to find the overlap of [A,B] and [X,Y] involves
+ // simple interval intersections.
+ std::vector< std::pair > old = this->data_to_load; // copy old
+ this->data_to_load.clear(); // clear old
+
+ std::vector cumsum(1, 0); // start with 0
+ //std::cerr << cumsum[0] << std::endl;
+ for(U32 i = 0; i < intervals.size(); ++i){
+ cumsum.push_back(intervals[i].second - intervals[i].first + cumsum.back());
+ //std::cerr << cumsum.back() << std::endl;
+ }
+
+ std::vector< std::pair > relative_intervals;
+ for(U32 i = 0; i < intervals.size(); ++i){
+ relative_intervals.push_back(std::pair(cumsum[i], cumsum[i+1]));
+ //std::cerr << "Relative: " << relative_intervals.back().first << "->" << relative_intervals.back().second << std::endl;
+ //std::cerr << "Absolute: " << intervals[i].first << "->" << intervals[i].second << std::endl;
+ }
+
+ //std::cerr << "Start desired" << std::endl;
+ //for(U32 i = 0 ; i < old.size(); ++i){
+ // std::cerr << old[i].first << "\t" << old[i].second << std::endl;
+ //}
+ //std::cerr << this->blocks[this->selected_chunk] << std::endl;
+ //std::cerr << "Start map" << std::endl;
+
+ // Cycle over desired relative regions
+ // overlap relative vs absolute
+ // [a, b] overlaps with [x, y] iff b > x and a < y.
+ // a = old[i].first --- desired relative
+ // b = old[i].second --- desired relative
+ // x = relative_intervals[j].first --- relative
+ // y = relative_intervals[j].second --- relative
+ for(U32 i = 0; i < old.size(); ++i){ // cycle over relative data to load
+ for(U32 j = 0; j < relative_intervals.size(); ++j){ // cycle over relative intervals
+ if(old[i].second > relative_intervals[j].first && old[i].first < relative_intervals[j].second){
+ //std::cerr << "desired collapsed: " << old[i].first << "-" << old[i].second << " overlapping with relative interval " << relative_intervals[j].first << "-" << relative_intervals[j].second << std::endl;
+ //std::cerr << "absolute: " << intervals[j].first << "->" << intervals[j].second << std::endl;
+
+ U32 start = 0;
+ U32 end = 0;
+
+ // if A < X -> start = X
+ if(old[i].first < relative_intervals[j].first) start = relative_intervals[j].first;
+ // else start = A
+ else start = old[i].first;
+ // if B > Y -> end = Y
+ if(old[i].second > relative_intervals[j].second) end = relative_intervals[j].second;
+ // else end = B
+ else end = old[i].second;
+
+ //std::cerr << start << "->" << end << std::endl;
+
+ // Remove start position from these
+ //std::cerr << start - relative_intervals[j].first << "->" << end - relative_intervals[j].first << std::endl;
+ //std::cerr << "final: " << intervals[j].first + start - relative_intervals[j].first << "->" << intervals[j].first + end - relative_intervals[j].first << std::endl;
+ this->data_to_load.push_back(std::pair(intervals[j].first + start - relative_intervals[j].first, intervals[j].first + end - relative_intervals[j].first));
+
+ } else {
+
+ }
+ }
+ }
+
+ // Dedupe
+ std::sort(this->data_to_load.begin(), this->data_to_load.end());
+ std::vector> deduped;
+ deduped.push_back(this->data_to_load[0]);
+ for(U32 i = 1; i < this->data_to_load.size(); ++i){
+ if(this->data_to_load[i].first <= deduped.back().second){
+ if(deduped.back().second < this->data_to_load[i].second)
+ deduped.back().second = this->data_to_load[i].second;
+ }
+ else deduped.push_back(this->data_to_load[i]);
+ }
+
+ //for(U32 i = 0; i < this->data_to_load.size(); ++i){
+ // std::cerr << "original: " << this->data_to_load[i].first << "->" << this->data_to_load[i].second << std::endl;
+ //}
+ //for(U32 i = 0; i < deduped.size(); ++i){
+ // std::cerr << "deduped: " << deduped[i].first << "->" << deduped[i].second << std::endl;
+ //}
+ this->data_to_load = deduped;
+
+ return true;
+}
+
+bool LoadBalancerLD::getSelectedLoadThreadsInterval(const reader_type& reader, const U32 threads, const std::vector< std::pair >& intervals){
+ //std::cerr << this->selected_chunk << "/" << this->blocks.size() << std::endl;
+ const value_type& selected = this->blocks[this->selected_chunk];
+ //std::cerr << selected << std::endl;
+ this->thread_distribution.resize(threads);
+
+ // If a square is attached to the diagonal
+ // In this case, workload has to be distrubuted on the
+ // upper triangular of comparisons to avoid duplicate
+ // computation
+ if(selected.isDiagonal()){
+ //std::cerr << "diagonal" << std::endl;
+ U64 n_comparisons_thread = selected.getSize() / threads;
+ if(threads == 1) n_comparisons_thread = std::numeric_limits::max();
+
+ U64 n_blocks_incrementor = 0;
+ std::pair current;
+ U32 currentThread = 0;
+
+ for(U32 i = selected.fromRow; i < selected.toRow; ++i){
+ current.first = i;
+ current.second = i;
+
+ for(U32 j = i; j < selected.toColumn; ++j){
+ ++n_blocks_incrementor;
+
+ if(n_blocks_incrementor > n_comparisons_thread){
+ current.second = j + 1; // not inclusive range [A, B)
+ if(current.second > selected.toColumn) current.second = selected.toColumn;
+ //std::cerr << "break counter: " << current.first << "->" << current.second << " with " << n_variants_counter << "/" << n_variants_partition << " of " << this->n_comparisons_chunk << std::endl;
+ this->thread_distribution[currentThread].push_back(LoadBalancerThread(i - selected.fromRow, current.first - selected.fromRow, current.second - selected.fromRow));
+
+ current.first = j + 1;
+ current.second = j;
+ n_blocks_incrementor = 0;
+ ++currentThread;
+ }
+ }
+
+ //std::cerr << current.second << std::endl;
+ if(current.second != selected.toRow){
+ current.second = selected.toRow;
+ //std::cerr << "break new line: " << current.first << "->" << current.second << " with " << n_variants_counter <<"/" << n_variants_partition << std::endl;
+ this->thread_distribution[currentThread].push_back(LoadBalancerThread(i - selected.fromRow, current.first - selected.fromRow, current.second - selected.fromRow));
+ //parts.push_back(current);
+ if(n_blocks_incrementor > n_comparisons_thread){
+ //std::cerr << "break here" << std::endl;
+ n_blocks_incrementor = 0;
+ ++currentThread;
+ }
+ }
+ }
+
+ /*
+ std::cerr << "currentThread: " << currentThread << std::endl;
+ std::cerr << selected.toRow - selected.fromRow << std::endl;
+ for(U32 i = 0; i < this->thread_distribution.size(); ++i){
+ for(U32 j = 0; j < this->thread_distribution[i].size(); ++j){
+ std::cerr << "idx: "<< i << "; row: " << this->thread_distribution[i][j].row << ":" << this->thread_distribution[i][j].fromColumn << "->" << this->thread_distribution[i][j].toColumn << "/" << reader.getIndex().getContainer().size() << std::endl;
+ }
+ }
+ std::cerr << "here" << std::endl;
+ */
+
+ } else {
+ //std::cerr << "square" << std::endl;
+ const value_type& selectedRow = this->blocks[this->selected_chunk];
+ const value_type& selectedCol = this->blocks[this->selected_chunk];
+
+ U64 n_comparisons_thread = selected.getSize() / threads;
+ if(threads == 1) n_comparisons_thread = std::numeric_limits::max();
+
+ U64 n_blocks_incrementor = 0;
+ std::pair current;
+ U32 currentThread = 0;
+
+
+ for(U32 i = selectedRow.fromRow; i < selectedRow.toRow; ++i){
+ current.first = selectedCol.fromColumn;
+ current.second = selectedCol.fromColumn;
+
+ for(U32 j = selectedCol.fromColumn; j < selectedCol.toColumn; ++j){
+ ++n_blocks_incrementor;
+
+ if(n_blocks_incrementor > n_comparisons_thread){
+ current.second = j + 1; // not n_comparisons_thread range [A, B)
+ if(current.second > selected.toColumn) current.second = selected.toColumn;
+ //std::cerr << "break counter: " << current.first << "->" << current.second << " with " << n_variants_counter << "/" << n_variants_partition << std::endl;
+ this->thread_distribution[currentThread].push_back(LoadBalancerThread(
+ i - selectedRow.fromRow,
+ (selectedRow.toRow - selectedRow.fromRow) + current.first - selectedCol.fromColumn,
+ (selectedRow.toRow - selectedRow.fromRow) + current.second - selectedCol.fromColumn));
+
+ current.first = j + 1;
+ current.second = j;
+ n_blocks_incrementor = 0;
+ ++currentThread;
+ }
+ }
+
+ //std::cerr << current.second << std::endl;
+ if(current.second != selected.toColumn){
+ current.second = selected.toColumn;
+ //std::cerr << "break new line: " << current.first << "->" << current.second << std::endl;
+ this->thread_distribution[currentThread].push_back(LoadBalancerThread(
+ i - selectedRow.fromRow,
+ (selectedRow.toRow - selectedRow.fromRow) + current.first - selectedCol.fromColumn,
+ (selectedRow.toRow - selectedRow.fromRow) + current.second - selectedCol.fromColumn));
+ //parts.push_back(current);
+ if(n_blocks_incrementor > n_comparisons_thread){
+ n_blocks_incrementor = 0;
+ ++currentThread;
+ }
+ }
+ }
+
+ /*
+ std::cerr << "currentThread: " << currentThread << std::endl;
+ std::cerr << selected.toRow - selected.fromRow << std::endl;
+ for(U32 i = 0; i < this->thread_distribution.size(); ++i){
+ for(U32 j = 0; j < this->thread_distribution[i].size(); ++j){
+ std::cerr << "idx: "<< i << "; row: " << this->thread_distribution[i][j].row << ":" << this->thread_distribution[i][j].fromColumn << "->" << this->thread_distribution[i][j].toColumn << "/" << reader.getIndex().getContainer().size() << std::endl;
+ }
+ }
+ std::cerr << "here" << std::endl;
+ */
+ }
+
+ //for(U32 i = 0; i < this->data_to_load.size(); ++i)
+ // std::cerr << i << '\t' << this->data_to_load[i].first << "->" << this->data_to_load[i].second << std::endl;
+
+ //std::cerr << "returning" << std::endl;
+ return true;
+}
+
bool LoadBalancerLD::getSelectedLoadThreads(const reader_type& reader, const U32 threads){
const value_type& selected = this->blocks[this->selected_chunk];
this->thread_distribution.resize(threads);
- // limit
+ // If a square is attached to the diagonal
+ // In this case, workload has to be distrubuted on the
+ // upper triangular of comparisons to avoid duplicate
+ // computation
if(selected.isDiagonal()){
//std::cerr << "diagnonal" << std::endl;
this->n_comparisons_chunk = 0;
for(U32 i = selected.fromRow; i < selected.toRow; ++i){
for(U32 j = i; j < selected.toColumn; ++j){
- if(i == j)
+ if(i == j) // upper triangular
this->n_comparisons_chunk += (reader.getIndex().getContainer().at(i).n_variants * reader.getIndex().getContainer().at(i).n_variants - reader.getIndex().getContainer().at(i).n_variants) / 2;
- else
+ else // square
this->n_comparisons_chunk += reader.getIndex().getContainer().at(i).n_variants * reader.getIndex().getContainer().at(j).n_variants;
}
}
@@ -180,12 +424,145 @@ bool LoadBalancerLD::setDesired(const S32 desired){
return true;
}
+bool LoadBalancerLD::BuildInterval(const reader_type& reader, const U32 threads){
+ if(reader.interval_tree_entries == nullptr){
+ std::cerr << helpers::timestamp("ERR", "BALANCER") << "Cannot use slicing function without providing intervals..." << std::endl;
+ return false;
+ }
+
+ // If slicing
+ std::vector< std::pair > target_blocks;
+
+ for(U32 i = 0; i < reader.getHeader().getMagic().n_contigs; ++i){
+ for(U32 j = 0; j < reader.interval_tree_entries[i].size(); ++j){
+ //std::cerr << reader.interval_tree_entries[i][j] << std::endl;
+ std::pair blocks = reader.getIndex().getContainer().findOverlap(reader.interval_tree_entries[i][j].contigID,
+ reader.interval_tree_entries[i][j].start,
+ reader.interval_tree_entries[i][j].stop);
+
+ if(blocks.first == 0 && blocks.second == 0){
+ std::cerr << helpers::timestamp("LOG", "BALANCER") << "No data found for interval (ID:" << reader.interval_tree_entries[i][j].contigID << ":" << reader.interval_tree_entries[i][j].start << "-" << reader.interval_tree_entries[i][j].stop << ")..." << std::endl;
+ continue;
+ }
+
+ //std::cerr << "found blocks: " << blocks.first << "->" << blocks.second << std::endl;
+
+ target_blocks.push_back(blocks);
+ //for(U32 b = blocks.first; b < blocks.second; ++b){
+ // std::cerr << "Matches: " << b << std::endl;
+ //}
+ }
+ }
+
+
+ // Slicing does not overlap any data
+ if(target_blocks.size() == 0){
+ std::cerr << helpers::timestamp("ERROR", "BALANCER") << "Could not find any data in the region(s) specified..." << std::endl;
+ return false;
+ }
+
+ std::sort(target_blocks.begin(), target_blocks.end());
+ std::vector< std::pair > target_blocks_merged;
+ target_blocks_merged.push_back(target_blocks[0]);
+ //std::cerr << target_blocks[0].first << "->" << target_blocks[0].second << std::endl;
+ for(U32 i = 1; i < target_blocks.size(); ++i){
+ //std::cerr << target_blocks[i].first << "->" << target_blocks[i].second << std::endl;
+ if(target_blocks[i].first <= target_blocks_merged.back().second)
+ target_blocks_merged.back().second = target_blocks[i].second;
+ else
+ target_blocks_merged.push_back(target_blocks[i]);
+ }
+
+ //std::cerr << "merged" << std::endl;
+ //for(U32 i = 0; i < target_blocks_merged.size(); ++i){
+ // std::cerr << target_blocks_merged[i].first << "->" << target_blocks_merged[i].second << std::endl;
+ //}
+
+ // Count available blocks;
+ U32 n_total_blocks = 0;
+ for(U32 j = 0; j < target_blocks_merged.size(); ++j)
+ n_total_blocks += target_blocks_merged[j].second - target_blocks_merged[j].first;
+
+ //std::cerr << "total blocks: " << n_total_blocks << std::endl;
+
+ if(this->n_desired_chunks != 1){
+ U32 cutSize = 1;
+ for(U32 i = 1; i < n_total_blocks; ++i){
+ if((i*i - i) / 2 + i == this->n_desired_chunks) // N choose 2 + N (upper triangular + diagonal)
+ cutSize = i;
+ }
+
+ //std::cerr << "n choose 2: n = " << cutSize << std::endl;
+ if(cutSize == 1){
+ std::cerr << helpers::timestamp("ERROR", "BALANCER") << "Cannot cut into " << this->n_desired_chunks << " chunks. Chunks Have to be in the set choose(chunks,2) + chunks..." << std::endl;
+ return(false);
+ }
+
+ const U32 blocks_per_partition = n_total_blocks / cutSize;
+ //std::cerr << "blocks/partition: " << blocks_per_partition << std::endl;
+ U32 total = 0; // Sanity
+ for(U32 i = 0; i < cutSize; ++i){ // column 0..N
+ for(U32 j = i; j < cutSize; ++j){ // row c..N
+ if(j + 1 == cutSize){
+ if(i + 1 == cutSize){
+ //std::cerr << this->blocks.size() << ", last one: " << blocks_per_partition*i << "->" << n_total_blocks << ", " << blocks_per_partition*j << "->" << n_total_blocks << std::endl;
+ this->blocks.push_back(value_type(blocks_per_partition*i, n_total_blocks, blocks_per_partition*j, n_total_blocks));
+ } else {
+ //std::cerr << this->blocks.size() << ", last one: " << blocks_per_partition*i << "->" << blocks_per_partition*(i+1) << ", " << blocks_per_partition*j << "->" << n_total_blocks << std::endl;
+ this->blocks.push_back(value_type(blocks_per_partition*i, blocks_per_partition*(i+1), blocks_per_partition*j, n_total_blocks));
+ }
+ } else {
+ //std::cerr << this->blocks.size() << ", normal: " << blocks_per_partition*i << "->" << blocks_per_partition*(i+1) << ", " << blocks_per_partition*j << "->" << blocks_per_partition*(j+1) << std::endl;
+ this->blocks.push_back(value_type(blocks_per_partition*i, blocks_per_partition*(i+1), blocks_per_partition*j, blocks_per_partition*(j+1)));
+ }
+ ++total;
+ }
+ }
+
+ //std::cerr << "First block: " << this->blocks[0] << std::endl;
+
+ if(total != this->n_desired_chunks){
+ std::cerr << helpers::timestamp("ERROR", "BALANCER") << "Corrupted balancing..." << std::endl;
+ return(false);
+ }
+ } else {
+ this->blocks.push_back(value_type(0, n_total_blocks, 0, n_total_blocks));
+ }
+
+
+
+ // Data to load
+ this->getSelectedLoad();
+ //for(U32 i = 0; i < this->data_to_load.size(); ++i){
+ // std::cerr << this->data_to_load[i].first << "->" << this->data_to_load[i].second << std::endl;
+ //}
+
+ if(this->getSelectedLoadThreadsInterval(reader, threads, target_blocks_merged) == false){
+ std::cerr << helpers::timestamp("ERROR", "BALANCER") << "Corrupted balancing..." << std::endl;
+ return(false);
+ }
+
+ //std::cerr << "here after threads" << std::endl;
+
+ if(this->getSelectedLoadInterval(reader, target_blocks_merged) == false){
+ std::cerr << helpers::timestamp("ERROR", "BALANCER") << "Corrupted balancing..." << std::endl;
+ return(false);
+ }
+ // Load absolute
+
+ return(true);
+}
+
bool LoadBalancerLD::Build(const reader_type& reader, const U32 threads){
if(this->selected_chunk > this->n_desired_chunks){
std::cerr << helpers::timestamp("ERROR", "BALANCER") << "Incorrectly selected block (" << this->selected_chunk << '/' << this->n_desired_chunks << ")..." << std::endl;
return false;
}
+ // Have intervals set
+ if(reader.interval_tree_entries != nullptr)
+ return(this->BuildInterval(reader, threads));
+
// If selecting > 1 chunk
if(this->n_desired_chunks != 1){
U32 cutSize = 1;
@@ -226,7 +603,7 @@ bool LoadBalancerLD::Build(const reader_type& reader, const U32 threads){
this->blocks.push_back(value_type(0, reader.getIndex().getContainer().size(), 0, reader.getIndex().getContainer().size()));
}
- // Data to loadt
+ // Data to load
this->getSelectedLoad();
// Get thread load
diff --git a/src/algorithm/load_balancer_ld.h b/src/algorithm/load_balancer_ld.h
index 40c964c..46154ba 100644
--- a/src/algorithm/load_balancer_ld.h
+++ b/src/algorithm/load_balancer_ld.h
@@ -25,10 +25,13 @@ class LoadBalancerLD{
~LoadBalancerLD();
bool getSelectedLoad();
+ bool getSelectedLoadInterval(const reader_type& reader, const std::vector< std::pair >& intervals);
bool getSelectedLoadThreads(const reader_type& reader, const U32 threads);
+ bool getSelectedLoadThreadsInterval(const reader_type& reader, const U32 threads, const std::vector< std::pair >& intervals);
bool setSelected(const S32 selected);
bool setDesired(const S32 desired);
bool Build(const reader_type& reader, const U32 threads);
+ bool BuildInterval(const reader_type& reader, const U32 threads);
bool BuildWindow(const reader_type& reader, const U32 threads, const U64 n_window_bases);
inline std::vector< std::pair >& getLoad(void){ return(this->data_to_load); }
diff --git a/src/algorithm/sort/output_sorter.cpp b/src/algorithm/sort/output_sorter.cpp
index 494c825..869bde5 100644
--- a/src/algorithm/sort/output_sorter.cpp
+++ b/src/algorithm/sort/output_sorter.cpp
@@ -193,6 +193,8 @@ bool OutputSorter::sortMerge(const std::string& inputFile, const std::string& de
writer.setPartialSorted(false);
writer.setSorted(true);
writer.flush();
+ writer.getIndex()->getController().isSorted = true;
+ writer.getIndex()->buildMetaIndex(this->reader.getHeader().magic_.n_contigs);
writer.writeFinal();
if(!SILENT)
diff --git a/src/calc.h b/src/calc.h
index bfb2bfa..1d82194 100644
--- a/src/calc.h
+++ b/src/calc.h
@@ -1,5 +1,5 @@
/*
-Copyright (C) 2016-2017 Genome Research Ltd.
+Copyright (C) 2016-2018 Genome Research Ltd.
Author: Marcus D. R. Klarqvist
Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -34,21 +34,23 @@ void calc_usage(void){
" the all variant sites are guaranteed to have the given phasing.\n"
"Usage: " << tomahawk::constants::PROGRAM_NAME << " calc [options] -i -o \n\n"
"Options:\n"
- " -i FILE input Tomahawk (required)\n"
- " -o FILE output file (required)\n"
- " -t INT number of CPU threads (default: maximum available)\n"
- " -c INT number of parts to split problem into (must be in c!2 + c unless -w is triggered)\n"
- " -C INT chosen part to compute (0 < -C < -c)\n"
- " -w INT sliding window width in bases (approximative)\n"
- " -p force computations to use phased math [null]\n"
- " -u force computations to use unphased math [null]\n"
- " -f use fast-mode: output will have correlations only (no matrices or tests) [null]\n"
- " -a INT minimum number of non-major genotypes in 2-by-2 matrix (default: 1)\n"
- " -P FLOAT Fisher's exact test / Chi-squared cutoff P-value (default: 1)\n"
- " -r FLOAT Pearson's R-squared minimum cut-off value (default: 0.1)\n"
- " -R FLOAT Pearson's R-squared maximum cut-off value (default: 1.0)\n"
- " -d Show real-time progress update in cerr [null]\n"
- " -s Hide all program messages [null]\n";
+ " -i FILE input Tomahawk (required)\n"
+ " -o FILE output file or file prefix (required)\n"
+ " -t INT number of CPU threads (default: maximum available)\n"
+ " -c INT number of parts to split problem into (must be in c!2 + c unless -w is triggered)\n"
+ " -C INT chosen part to compute (0 < -C < -c)\n"
+ " -w INT sliding window width in bases (approximative)\n"
+ " -I STRING filter interval :pos-pos (see manual)\n"
+ " -p force computations to use phased math\n"
+ " -u force computations to use unphased math\n"
+ " -f use fast-mode: output will have correlations only (no matrices or tests)\n"
+ " -S INT number of individuals to sample in fast-mode (default: 1000)\n"
+ " -a INT minimum number of non-major genotypes in 2-by-2 matrix (default: 1)\n"
+ " -P FLOAT Fisher's exact test / Chi-squared cutoff P-value (default: 1)\n"
+ " -r FLOAT Pearson's R-squared minimum cut-off value (default: 0.1)\n"
+ " -R FLOAT Pearson's R-squared maximum cut-off value (default: 1.0)\n"
+ " -d Show real-time progress update in cerr\n"
+ " -s Hide all program messages\n";
}
int calc(int argc, char** argv){
@@ -66,12 +68,14 @@ int calc(int argc, char** argv){
{"input", required_argument, 0, 'i' },
{"threads", optional_argument, 0, 't' },
{"output", required_argument, 0, 'o' },
+ {"interval", optional_argument, 0, 'I' },
{"parts", optional_argument, 0, 'c' },
{"partStart", optional_argument, 0, 'C' },
{"minP", optional_argument, 0, 'P' },
{"phased", no_argument, 0, 'p' },
{"unphased", no_argument, 0, 'u' },
{"fast-mode", no_argument, 0, 'f' },
+ {"samples", optional_argument, 0, 'S' },
{"minR2", optional_argument, 0, 'r' },
{"maxR2", optional_argument, 0, 'R' },
{"minMHF", optional_argument, 0, 'a' },
@@ -89,10 +93,11 @@ int calc(int argc, char** argv){
tomahawk::TomahawkCalcParameters& parameters = tomahawk.getParameters();
std::string input;
std::string output;
+ std::vector filter_regions;
double windowBases = -1, windowPosition = -1; // not implemented
- while ((c = getopt_long(argc, argv, "i:o:t:puP:a:A:r:R:w:W:sdc:C:f?", long_options, &option_index)) != -1){
+ while ((c = getopt_long(argc, argv, "i:o:t:puP:a:A:r:R:w:W:S:I:sdc:C:f?", long_options, &option_index)) != -1){
switch (c){
case 0:
std::cerr << "Case 0: " << option_index << '\t' << long_options[option_index].name << std::endl;
@@ -103,6 +108,9 @@ int calc(int argc, char** argv){
case 'o':
output = std::string(optarg);
break;
+ case 'I':
+ filter_regions.push_back(std::string(optarg));
+ break;
case 't':
parameters.n_threads = atoi(optarg);
if(parameters.n_threads <= 0){
@@ -125,18 +133,17 @@ int calc(int argc, char** argv){
return(1);
}
break;
- case 'r':
- parameters.R2_min = atof(optarg);
- if(parameters.R2_min < 0){
- std::cerr << tomahawk::helpers::timestamp("ERROR") << "Cannot have a negative minimum R-squared value" << std::endl;
- return(1);
- } else if(parameters.R2_min > 1){
- std::cerr << tomahawk::helpers::timestamp("ERROR")<< "Cannot have minimum R-squared value > 1" << std::endl;
- return(1);
- }
- break;
-
- case 'R':
+ case 'r':
+ parameters.R2_min = atof(optarg);
+ if(parameters.R2_min < 0){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Cannot have a negative minimum R-squared value" << std::endl;
+ return(1);
+ } else if(parameters.R2_min > 1){
+ std::cerr << tomahawk::helpers::timestamp("ERROR")<< "Cannot have minimum R-squared value > 1" << std::endl;
+ return(1);
+ }
+ break;
+ case 'R':
parameters.R2_max = atof(optarg);
if(parameters.R2_max < 0){
std::cerr << tomahawk::helpers::timestamp("ERROR") << "Cannot have a negative maximum R-squared value" << std::endl;
@@ -147,19 +154,19 @@ int calc(int argc, char** argv){
}
break;
- case 'p':
+ case 'p':
parameters.force = tomahawk::TomahawkCalcParameters::force_method::phasedFunction;
break;
- case 'u':
+ case 'u':
parameters.force = tomahawk::TomahawkCalcParameters::force_method::unphasedFunction;
break;
- case 'f':
+ case 'f':
parameters.fast_mode = true;
break;
- case 'P':
+ case 'P':
parameters.P_threshold = atof(optarg);
if(parameters.P_threshold < 0){
std::cerr << tomahawk::helpers::timestamp("ERROR") << "Cannot have a negative cutoff P-value" << std::endl;
@@ -169,7 +176,7 @@ int calc(int argc, char** argv){
return(1);
}
break;
- case 'a':
+ case 'a':
parameters.minimum_sum_alternative_haplotype_count = atoi(optarg);
if(parameters.minimum_sum_alternative_haplotype_count < 0){
std::cerr << tomahawk::helpers::timestamp("ERROR") << "Cannot have negative minimum allele count" << std::endl;
@@ -177,7 +184,7 @@ int calc(int argc, char** argv){
}
break;
- case 'A':
+ case 'A':
parameters.maximum_sum_alternative_haplotype_count = atoi(optarg);
if(parameters.maximum_sum_alternative_haplotype_count < 0){
std::cerr << tomahawk::helpers::timestamp("ERROR") << "Cannot have negative maximum allele count" << std::endl;
@@ -185,7 +192,7 @@ int calc(int argc, char** argv){
}
break;
- case 'w':
+ case 'w':
if(std::regex_match(std::string(optarg), std::regex("^(([0-9]+)|([0-9]+[eE]{1}[0-9]+))$")) == false){
std::cerr << "not an integer" << std::endl;
return(1);
@@ -204,26 +211,26 @@ int calc(int argc, char** argv){
parameters.n_window_bases = windowBases;
break;
- case 'W':
- windowPosition = atoi(optarg);
- if(windowPosition <= 0){
- std::cerr << tomahawk::helpers::timestamp("ERROR") << "Cannot have a non-positive window size" << std::endl;
- return(1);
- }
+ case 'W':
+ windowPosition = atoi(optarg);
+ if(windowPosition <= 0){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Cannot have a non-positive window size" << std::endl;
+ return(1);
+ }
break;
- case 's':
- SILENT = 1;
- parameters.detailed_progress = false;
- break;
+ case 's':
+ SILENT = 1;
+ parameters.detailed_progress = false;
+ break;
- case 'd':
+ case 'd':
SILENT = 0;
parameters.detailed_progress = true;
break;
- default:
+ default:
std::cerr << tomahawk::helpers::timestamp("ERROR") << "Unrecognized option: " << (char)c << std::endl;
return(1);
}
@@ -245,12 +252,18 @@ int calc(int argc, char** argv){
std::cerr << tomahawk::helpers::timestamp("LOG") << "Calling calc..." << std::endl;
}
-
// Parse Tomahawk
if(!tomahawk.Open(input, output)){
std::cerr << tomahawk::helpers::timestamp("ERROR") << "Failed build!" << std::endl;
return 1;
}
- return(tomahawk.Calculate());
+ if(!tomahawk.addRegions(filter_regions)){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Failed to add region!" << std::endl;
+ return 1;
+ }
+
+ // Return should be 0 for success
+ if(tomahawk.Calculate()) return 0;
+ else return 1;
}
diff --git a/src/concat.h b/src/concat.h
index 222d49c..17c5211 100644
--- a/src/concat.h
+++ b/src/concat.h
@@ -1,5 +1,5 @@
/*
-Copyright (C) 2016-2017 Genome Research Ltd.
+Copyright (C) 2016-2018 Genome Research Ltd.
Author: Marcus D. R. Klarqvist
Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/src/import.h b/src/import.h
index d19dc2e..cb18dc8 100644
--- a/src/import.h
+++ b/src/import.h
@@ -1,5 +1,5 @@
/*
-Copyright (C) 2016-2017 Genome Research Ltd.
+Copyright (C) 2016-2018 Genome Research Ltd.
Author: Marcus D. R. Klarqvist
Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/src/index/index.cpp b/src/index/index.cpp
index fdf849e..2893e04 100644
--- a/src/index/index.cpp
+++ b/src/index/index.cpp
@@ -15,11 +15,15 @@ Index::Index(const char* const data, const U32 l_data) :
}
bool Index::buildMetaIndex(const U32 n_contigs){
- if(this->getContainer().size() == 0)
+ if(this->getContainer().size() == 0){
+ //std::cerr << "container size is 0..." << std::endl;
return false;
+ }
- if(this->isSorted() == false)
+ if(this->isSorted() == false){
+ //std::cerr << "data not sorted..." << std::endl;
return false;
+ }
meta_entry_type reference_entry;
reference_entry.index_begin = 0;
diff --git a/src/index/index.h b/src/index/index.h
index 7067210..de432b3 100644
--- a/src/index/index.h
+++ b/src/index/index.h
@@ -109,6 +109,37 @@ class Index{
*/
bool buildMetaIndex(const U32 n_contigs);
+ /**<
+ *
+ * @param contigID
+ * @param fromPos
+ * @param toPos
+ * @return
+ */
+ std::vector findOverlaps(const U32 contigID, const U32 fromPos, const U32 toPos) const{
+ if(contigID > this->sizeMeta())
+ return std::vector();
+
+ const U32 blockFrom = this->meta_container_[contigID].index_begin;
+ const U32 blockTo = this->meta_container_[contigID].index_end;
+ std::vector ret;
+
+ for(U32 i = blockFrom; i < blockTo; ++i){
+ // [a, b] overlaps with [x, y] iff b > x and a < y.
+ // a = fromPos
+ // b = toPos
+ // x = this->container_[i].min_position
+ // y = this->container_[i].max_position
+ if(toPos >= this->container_[i].min_position && fromPos < this->container_[i].max_position)
+ ret.push_back(i);
+
+ // Cannot extend any more
+ if(this->container_[i].min_position > toPos) break;
+ }
+
+ return ret;
+ }
+
private:
friend std::ofstream& operator<<(std::ofstream& stream, const self_type& index){
stream << index.getController();
@@ -117,10 +148,6 @@ class Index{
return(stream);
}
- friend std::ostream& operator<<(std::ostream& stream, const self_type& index){
- return(stream);
- }
-
private:
controller_type controller_;
meta_container_type meta_container_;
diff --git a/src/index/index_container.h b/src/index/index_container.h
index 4c3c6f9..e9ed82e 100644
--- a/src/index/index_container.h
+++ b/src/index/index_container.h
@@ -101,6 +101,7 @@ class IndexContainer{
// Overload basic operator
self_type& operator+=(const value_type& index_entry);
+ // These functions are used for `twk`
// Overlap functions: find blocks a target interval overlaps
// returns pairs of pointers. If pointerA == pointerB then the data is empty
std::pair findOverlap(const S32& contigID) const;
diff --git a/src/io/output_writer.h b/src/io/output_writer.h
index d021059..1c38935 100644
--- a/src/io/output_writer.h
+++ b/src/io/output_writer.h
@@ -143,6 +143,8 @@ class OutputWriter{
void writePrecompressedBlock(buffer_type& buffer, const U64& uncompressed_size);
+ inline index_type* getIndex(void){ return(this->index_); }
+
private:
void CheckOutputNames(const std::string& input);
diff --git a/src/io/vcf/vcf_header.cpp b/src/io/vcf/vcf_header.cpp
index d589a69..ef3160a 100644
--- a/src/io/vcf/vcf_header.cpp
+++ b/src/io/vcf/vcf_header.cpp
@@ -178,7 +178,7 @@ bool VCFHeader::__parseFirstLine(const char* const data, U32& offset){
offset = 4;
if(strncmp(&data[offset], &vcf::constants::HEADER_VCF_FORMAT[0], vcf::constants::HEADER_VCF_FORMAT.size()) != 0){
std::cerr << helpers::timestamp("ERROR", "BCF") << "Invalid VCF format..." << std::endl;
- std::cerr << std::string(&data[offset], 100) << std::endl;
+ //std::cerr << std::string(&data[offset], 100) << std::endl;
this->error_bit = VCF_ERROR_LINE1;
return false;
}
diff --git a/src/io/vcf/vcf_header.h b/src/io/vcf/vcf_header.h
index 9f4e187..64ec9bb 100644
--- a/src/io/vcf/vcf_header.h
+++ b/src/io/vcf/vcf_header.h
@@ -59,7 +59,7 @@ class VCFHeader {
tgzf_type tgzf_controller;
for(U32 i = 0; i < this->literal_lines.size(); ++i){
- std::cerr << this->literal_lines[i] << std::endl;
+ //std::cerr << this->literal_lines[i] << std::endl;
temp += this->literal_lines[i];
}
tgzf_controller.Deflate(temp);
diff --git a/src/main.cpp b/src/main.cpp
index abec28b..41a7582 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,5 +1,5 @@
/*
-Copyright (C) 2016-2017 Genome Research Ltd.
+Copyright (C) 2016-2018 Genome Research Ltd.
Author: Marcus D. R. Klarqvist
Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -29,7 +29,6 @@ DEALINGS IN THE SOFTWARE.
#include "view.h"
#include "sort.h"
#include "concat.h"
-#include "stats.h"
int main(int argc, char** argv){
if(tomahawk::helpers::isBigEndian()){
@@ -63,9 +62,6 @@ int main(int argc, char** argv){
} else if(strncmp(&argv[1][0], "concat", 6) == 0){
return(concat(argc, argv));
- } else if(strncmp(&argv[1][0], "stats", 5) == 0){
- return(stats(argc, argv));
-
} else if(strncmp(&argv[1][0], "--version", 9) == 0 || strncmp(&argv[1][0], "version", 7) == 0){
programMessage(false);
return(0);
diff --git a/src/sort.h b/src/sort.h
index adc4d43..6e0101b 100644
--- a/src/sort.h
+++ b/src/sort.h
@@ -1,5 +1,5 @@
/*
-Copyright (C) 2016-2017 Genome Research Ltd.
+Copyright (C) 2016-2018 Genome Research Ltd.
Author: Marcus D. R. Klarqvist
Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/src/stats.h b/src/stats.h
deleted file mode 100644
index a87b0f1..0000000
--- a/src/stats.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
-Copyright (C) 2016-2017 Genome Research Ltd.
-Author: Marcus D. R. Klarqvist
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-#include
-
-#include "tomahawk/tomahawk_reader.h"
-#include "tomahawk/two/output_filter.h"
-#include "tomahawk/two/TomahawkOutputReader.h"
-#include "utility.h"
-
-void stats_usage(void){
- programMessage();
- std::cerr <<
- "About: Calculates basic summary statistics for a TWK/TWO file.\n"
- " Data does not have to be indexed. However, operations are faster if they\n"
- " are.\n"
- "Usage: " << tomahawk::constants::PROGRAM_NAME << " stats [options] -i \n\n"
- "Options:\n"
- " -i FILE input Tomahawk (required)\n"
- " -b INT number of bins (default: 10)\n";
-}
-
-int stats(int argc, char** argv){
- if(argc < 3){
- stats_usage();
- return(1);
- }
-
- static struct option long_options[] = {
- {"input", required_argument, 0, 'i' },
- {"bins", optional_argument, 0, 'b' },
- {"silent", no_argument, 0, 's' },
- {0,0,0,0}
- };
-
- // Parameter defaults
- std::string input, output;
-
- int c = 0;
- int long_index = 0;
- S32 bins = 10;
- while ((c = getopt_long(argc, argv, "i:b:s", long_options, &long_index)) != -1){
- switch (c){
- case ':': /* missing option argument */
- fprintf(stderr, "%s: option `-%c' requires an argument\n",
- argv[0], optopt);
- break;
-
- case '?':
- default:
- fprintf(stderr, "%s: option `-%c' is invalid: ignored\n",
- argv[0], optopt);
- break;
-
- case 'i':
- input = std::string(optarg);
- break;
- case 'b':
- bins = atoi(optarg);
- if(bins <= 0){
- std::cerr << tomahawk::helpers::timestamp("ERROR") << "Cannot set bins to <= 0..." << std::endl;
- return(1);
- }
- break;
- case 's':
- SILENT = 1;
- break;
- }
- }
-
- if(input.length() == 0){
- std::cerr << tomahawk::helpers::timestamp("ERROR") << "No input file specified..." << std::endl;
- return(1);
- }
-
- if(!SILENT){
- programMessage();
- std::cerr << tomahawk::helpers::timestamp("LOG") << "Calling stats..." << std::endl;
- }
-
- // Todo: move out
- std::vector inputFile_parts = tomahawk::helpers::split(input, '.');
- std::string& end = inputFile_parts[inputFile_parts.size() - 1];
- std::transform(end.begin(), end.end(), end.begin(), ::tolower); // transform chars to lower case
-
- if(end == tomahawk::constants::OUTPUT_SUFFIX){
- std::cerr << tomahawk::helpers::timestamp("ERROR") << "Unsupported" << std::endl;
- return(1);
-
- } else if(end == tomahawk::constants::OUTPUT_LD_SUFFIX){
- tomahawk::TomahawkOutputReader reader;
- //reader.setWriteHeader(outputHeader);
- //tomahawk::TomahawkOutputFilterController& filter = reader.getFilter();
- //filter = tomahawk::TomahawkOutputFilterController(two_filter); // use copy ctor to transfer data
-
- //if(!reader.setWriterType(outputType))
- // return 1;
-
- //if(!reader.Open(input))
- // return 1;
-
- //if(!reader.AddRegions(filter_regions)){
- // std::cerr << tomahawk::helpers::timestamp("ERROR") << "Failed to add region!" << std::endl;
- // return 1;
- //}
-
- //if(!reader.summary(input, bins))
- // return 1;
-
- } else {
- std::cerr << tomahawk::helpers::timestamp("ERROR") << "Unrecognised input file format: " << input << std::endl;
- return 1;
- }
-
- return 0;
-}
diff --git a/src/support/MagicConstants.h b/src/support/MagicConstants.h
index 9934048..298a036 100644
--- a/src/support/MagicConstants.h
+++ b/src/support/MagicConstants.h
@@ -16,12 +16,12 @@ extern std::string INTERPRETED_COMMAND;
// Versioning
const int PROGRAM_VERSION_MAJOR = 0; // major
const int PROGRAM_VERSION_MINOR = 5;
-const int PROGRAM_VERSION_PATCH = 1;
+const int PROGRAM_VERSION_PATCH = 2;
const double ALLOWED_ROUNDING_ERROR = 0.001;
-const std::string PROGRAM_NAME = "tomahawk";
-const std::string OUTPUT_SUFFIX = "twk";
+const std::string PROGRAM_NAME = "tomahawk";
+const std::string OUTPUT_SUFFIX = "twk";
const std::string OUTPUT_LD_SUFFIX = "two";
// Headers
diff --git a/src/tomahawk/TomahawkCalc.cpp b/src/tomahawk/TomahawkCalc.cpp
index 0c32e0e..87c4f28 100644
--- a/src/tomahawk/TomahawkCalc.cpp
+++ b/src/tomahawk/TomahawkCalc.cpp
@@ -20,6 +20,10 @@ bool TomahawkCalc::Open(const std::string input, const std::string output){
return true;
}
+bool TomahawkCalc::addRegions(std::vector& positions){
+ return(this->reader.addRegions(positions));
+}
+
template
bool comparePairs(const std::pair& a, const std::pair& b){ return a.first < b.first; }
@@ -85,6 +89,11 @@ bool TomahawkCalc::Calculate(){
this->balancer.setDesired(this->parameters.n_chunks);
if(this->parameters.window_mode){
+ if(reader.interval_tree_entries != nullptr){
+ std::cerr << helpers::timestamp("ERROR", "BALANCER") << "Window mode when slicng is not supported..." << std::endl;
+ return false;
+ }
+
if(!this->balancer.BuildWindow(this->reader, this->parameters.n_threads, this->parameters.n_window_bases)){
std::cerr << helpers::timestamp("ERROR", "BALANCER") << "Failed to split into blocks..." << std::endl;
return false;
@@ -96,7 +105,6 @@ bool TomahawkCalc::Calculate(){
}
}
-
return(this->Calculate(this->balancer.getLoad()));
}
diff --git a/src/tomahawk/TomahawkCalc.h b/src/tomahawk/TomahawkCalc.h
index b8d1d2c..f56b3f5 100644
--- a/src/tomahawk/TomahawkCalc.h
+++ b/src/tomahawk/TomahawkCalc.h
@@ -24,6 +24,7 @@ class TomahawkCalc{
TomahawkCalc();
~TomahawkCalc();
bool Open(const std::string input, const std::string output);
+ bool addRegions(std::vector& positions);
bool Calculate(pair_vector& blocks);
bool Calculate(std::vector& blocks);
bool Calculate();
diff --git a/src/tomahawk/genotype_container_reference.h b/src/tomahawk/genotype_container_reference.h
index d34ab10..9a5f42d 100644
--- a/src/tomahawk/genotype_container_reference.h
+++ b/src/tomahawk/genotype_container_reference.h
@@ -2,6 +2,7 @@
#define TOMAHAWK_BASE_GENOTYPE_CONTAINER_REFERENCE_H_
#include // size_t, ptrdiff_t
+#include
#include "support/type_definitions.h"
#include "index/index_entry.h"
@@ -157,6 +158,24 @@ class GenotypeContainerReference{
// Update index
this->haplotype_bitvectors[i].indices = new U32[tempListIdx];
+ this->haplotype_bitvectors[i].n_actual = this->haplotype_bitvectors[i].l_list;
+ if(this->haplotype_bitvectors[i].n_actual > 1000){
+ int N = 1000;
+ std::vector a(tempListIdx);
+ a.assign(tempList, tempList + tempListIdx);
+ for(S32 i = N-1; i > 0; --i){
+ S32 r = rand() % (i + 1);
+ std::swap(a[i], a[r]);
+ }
+
+ this->haplotype_bitvectors[i].sampled_indicies = new U32[1000];
+ for(U32 j = 0; j < 1000; ++j){
+ this->haplotype_bitvectors[i].sampled_indicies[j] = a[j];
+ //std::cerr << a[j] << ",";
+ }
+ //std::sort(this->haplotype_bitvectors[i].sampled_indicies, &this->haplotype_bitvectors[i].sampled_indicies[1000]);
+ }
+
for(U32 j = 0; j < tempListIdx; ++j){
this->haplotype_bitvectors[i].indices[j] = tempList[j];
//this->sbbst_trees[i].Insert(tempList[j], tempList[j]);
diff --git a/src/tomahawk/haplotype_bitvector.h b/src/tomahawk/haplotype_bitvector.h
index 1f1bd13..f201904 100644
--- a/src/tomahawk/haplotype_bitvector.h
+++ b/src/tomahawk/haplotype_bitvector.h
@@ -8,8 +8,10 @@ struct HaplotypeBitVector{
public:
HaplotypeBitVector() :
n_bytes(0),
+ n_actual(0),
l_list(0),
indices(nullptr),
+ sampled_indicies(nullptr),
entries(nullptr)
{
@@ -17,13 +19,19 @@ struct HaplotypeBitVector{
HaplotypeBitVector(const U64 n_entries) :
n_bytes(ceil((double)n_entries/8)),
+ n_actual(0),
l_list(0),
indices(nullptr),
+ sampled_indicies(nullptr),
entries(new BYTE[n_bytes])
{
memset(this->entries, 0, this->n_bytes);
}
- ~HaplotypeBitVector(){ delete [] this->entries; delete [] this->indices; }
+ ~HaplotypeBitVector(){
+ delete [] this->entries;
+ delete [] this->indices;
+ delete [] this->sampled_indicies;
+ }
inline void reset(void){ memset(this->entries, 0, this->n_bytes); }
inline const bool operator[](const U32& position) const{ return(this->entries[position/8] & (1 << (position % 8))); }
@@ -33,8 +41,10 @@ struct HaplotypeBitVector{
public:
U32 n_bytes;
+ U32 n_actual;
U32 l_list; // number of odd items
U32* indices;
+ U32* sampled_indicies;
BYTE* entries;
};
diff --git a/src/tomahawk/ld_calculation_slave.h b/src/tomahawk/ld_calculation_slave.h
index dfb2b3b..88bb19f 100644
--- a/src/tomahawk/ld_calculation_slave.h
+++ b/src/tomahawk/ld_calculation_slave.h
@@ -153,26 +153,28 @@ const VECTOR_TYPE maskUnphasedLow = _mm_set1_epi8(UNPHASED_LOWER_MASK); // 0101
template
class LDSlave{
- typedef LDSlave self_type;
- typedef GenotypeMetaContainerReference manager_type;
- typedef base::GenotypeContainerReference block_type;
+ typedef LDSlave self_type;
+ typedef GenotypeMetaContainerReference manager_type;
+ typedef base::GenotypeContainerReference block_type;
+ typedef const support::GenotypeDiploidRun run_type;
+ typedef base::GenotypeContainerRunlengthObjects rle_type;
+
typedef const MetaEntry meta_type;
- typedef const support::GenotypeDiploidRun run_type;
typedef totempole::IndexEntry totempole_entry_type;
typedef io::OutputWriter output_writer_type;
typedef support::OutputEntrySupport helper_type;
typedef base::GenotypeBitvector<> simd_pair;
- typedef base::GenotypeContainerRunlengthObjects rle_type;
+
typedef interface::ProgressBar progress_bar_type;
typedef support::LDCalculationSIMDHelper<> simd_helper_type;
typedef TomahawkCalcParameters parameter_type;
// Work orders
- typedef LoadBalancerThread order_type;
+ typedef LoadBalancerThread order_type;
typedef std::vector work_order;
// Function pointers
- typedef bool (self_type::*phaseFunction)(helper_type& helper, const block_type& block1, const block_type& block2) const;
+ typedef bool (self_type::*phaseFunction)(const block_type& block1, const block_type& block2);
public:
LDSlave(const manager_type& manager,
@@ -226,33 +228,38 @@ class LDSlave{
bool SquareWorkOrder(const order_type& order);
// Comparator functions
- bool CompareBlocks(helper_type& helper, block_type& block1);
- bool CompareBlocks(helper_type& helper, block_type& block1, block_type& block2);
- bool CompareBlocksFunction(helper_type& helper, const block_type& block1, const block_type& block2) const;
- bool CompareBlocksFunctionForcedPhased(helper_type& helper, const block_type& block1, const block_type& block2) const;
- bool CompareBlocksFunctionForcedPhasedSimple(helper_type& helper, const block_type& block1, const block_type& block2) const;
- bool CompareBlocksFunctionForcedUnphased(helper_type& helper, const block_type& block1, const block_type& block2) const;
+ bool CompareBlocks(block_type& block1);
+ bool CompareBlocks(block_type& block1, block_type& block2);
+ bool CompareBlocksFunction(const block_type& block1, const block_type& block2);
+ bool CompareBlocksFunctionForcedPhased(const block_type& block1, const block_type& block2);
+ bool CompareBlocksFunctionForcedPhasedSimple(const block_type& block1, const block_type& block2);
+ bool CompareBlocksFunctionForcedPhasedSimpleSample(const block_type& block1, const block_type& block2);
+ bool CompareBlocksFunctionForcedUnphased(const block_type& block1, const block_type& block2);
// Phased functions
- bool CalculateLDPhased(helper_type& helper, const block_type& block1, const block_type& block2) const;
- bool CalculateLDPhasedSimple(helper_type& helper, const block_type& block1, const block_type& block2) const;
- bool CalculateLDPhasedSimpleSample(helper_type& helper, const block_type& block1, const block_type& block2) const;
- bool CalculateLDPhasedMath(helper_type& helper) const;
- bool CalculateLDPhasedMathSimple(helper_type& helper, const block_type& block1, const block_type& block2) const;
- bool CalculateLDPhasedVectorized(helper_type& helper, const block_type& block1, const block_type& block2) const;
- bool CalculateLDPhasedVectorizedNoMissing(helper_type& helper, const block_type& block1, const block_type& block2) const;
- bool CalculateLDPhasedVectorizedNoMissingNoTable(helper_type& helper, const block_type& block1, const block_type& block2) const;
+ bool CalculateLDPhased(const block_type& block1, const block_type& block2);
+ bool CalculateLDPhasedSimple(const block_type& block1, const block_type& block2);
+ bool CalculateLDPhasedSimpleSample(const block_type& block1, const block_type& block2);
+ bool CalculateLDPhasedVectorized(const block_type& block1, const block_type& block2);
+ bool CalculateLDPhasedVectorizedNoMissing(const block_type& block1, const block_type& block2);
+ bool CalculateLDPhasedVectorizedNoMissingNoTable(const block_type& block1, const block_type& block2);
// Unphased functions
- bool CalculateLDUnphased(helper_type& helper, const block_type& block1, const block_type& block2) const;
- bool CalculateLDUnphasedVectorized(helper_type& helper, const block_type& block1, const block_type& block2) const;
- bool CalculateLDUnphasedVectorizedNoMissing(helper_type& helper, const block_type& block1, const block_type& block2) const;
- bool CalculateLDUnphasedMath(helper_type& helper) const;
+ bool CalculateLDUnphased(const block_type& block1, const block_type& block2);
+ bool CalculateLDUnphasedVectorized(const block_type& block1, const block_type& block2);
+ bool CalculateLDUnphasedVectorizedNoMissing(const block_type& block1, const block_type& block2);
+
+ // Math
+ bool CalculateLDPhasedMath();
+ bool CalculateLDPhasedMathSimple(const block_type& block1, const block_type& block2);
+ bool CalculateLDUnphasedMath();
// General functions
- inline const double ChiSquaredUnphasedTable(const helper_type& helper, const double& target, const double& p, const double& q) const;
- bool ChooseF11Calculate(helper_type& helper, const double& target, const double& p, const double& q) const;
- inline void setFLAGs(helper_type& helper, const block_type& block1, const block_type& block2) const;
+ inline const double ChiSquaredUnphasedTable(const double& target, const double& p, const double& q);
+ bool ChooseF11Calculate(const double& target, const double& p, const double& q);
+ inline void setFLAGs(const block_type& block1, const block_type& block2);
+
+ // Tests
private:
const parameter_type& parameters;
@@ -296,6 +303,8 @@ class LDSlave{
const U32 vectorCycles; // Number of SIMD cycles (genotypes/2/vector width)
const U32 phased_unbalanced_adjustment; // Modulus remainder
const U32 unphased_unbalanced_adjustment; // Modulus remainder
+
+ helper_type helper;
};
template
@@ -355,7 +364,7 @@ LDSlave& LDSlave::operator+=(const LDSlave& other){
}
template
-inline void LDSlave::setFLAGs(helper_type& helper, const block_type& block1, const block_type& block2) const{
+inline void LDSlave::setFLAGs(const block_type& block1, const block_type& block2){
// If long range
const meta_type& mA = block1.currentMeta();
const meta_type& mB = block2.currentMeta();
@@ -371,7 +380,7 @@ inline void LDSlave::setFLAGs(helper_type& helper, const block_type& block1,
}
template
-bool LDSlave::CalculateLDUnphased(helper_type& helper, const block_type& block1, const block_type& block2) const{
+bool LDSlave::CalculateLDUnphased(const block_type& block1, const block_type& block2){
if(block1.currentMeta().AF == 0 || block2.currentMeta().AF == 0)
return false;
@@ -444,13 +453,13 @@ bool LDSlave::CalculateLDUnphased(helper_type& helper, const block_type& bloc
std::cout << a.getMeta().MAF*this->samples + b.getMeta().MAF*this->samples << '\t' << ticks_per_iter.count() << '\t';
#endif
- this->setFLAGs(helper, block1, block2);
+ this->setFLAGs(block1, block2);
- return(this->CalculateLDUnphasedMath(helper));
+ return(this->CalculateLDUnphasedMath());
}
template
-inline const double LDSlave::ChiSquaredUnphasedTable(const helper_type& helper, const double& target, const double& p, const double& q) const{
+inline const double LDSlave::ChiSquaredUnphasedTable(const double& target, const double& p, const double& q){
const double f12 = p - target;
const double f21 = q - target;
const double f22 = 1 - (target + f12 + f21);
@@ -479,7 +488,7 @@ inline const double LDSlave::ChiSquaredUnphasedTable(const helper_type& helpe
}
template
-bool LDSlave::ChooseF11Calculate(helper_type& helper, const double& target, const double& p, const double& q) const{
+bool LDSlave::ChooseF11Calculate(const double& target, const double& p, const double& q){
helper.haplotypeCounts[0] = target;
helper.haplotypeCounts[1] = p - helper.haplotypeCounts[0];
helper.haplotypeCounts[2] = q - helper.haplotypeCounts[0];
@@ -533,7 +542,7 @@ bool LDSlave::ChooseF11Calculate(helper_type& helper, const double& target, c
}
template
-bool LDSlave::CalculateLDUnphasedMath(helper_type& helper) const{
+bool LDSlave::CalculateLDUnphasedMath(){
// Total amount of non-missing alleles
helper.totalHaplotypeCounts = helper[0] + helper[1] + helper[4] + helper[5]
+ helper[16] + helper[17] + helper[20] + helper[21]
@@ -571,7 +580,7 @@ bool LDSlave::CalculateLDUnphasedMath(helper_type& helper) const{
helper.chiSqModel = 0;
// Use standard math
- return(this->CalculateLDPhasedMath(helper));
+ return(this->CalculateLDPhasedMath());
}
const double p = ((helper[0] + helper[1] + helper[4] + helper[5])*2.0
@@ -624,23 +633,23 @@ bool LDSlave::CalculateLDUnphasedMath(helper_type& helper) const{
const double* chosen = α
if(alpha >= minhap - constants::ALLOWED_ROUNDING_ERROR && alpha <= maxhap + constants::ALLOWED_ROUNDING_ERROR){
++biologically_possible;
- helper.chiSqModel = this->ChiSquaredUnphasedTable(helper, alpha, p, q);
+ helper.chiSqModel = this->ChiSquaredUnphasedTable(alpha, p, q);
chosen = α
}
if(beta >= minhap - constants::ALLOWED_ROUNDING_ERROR && beta <= maxhap + constants::ALLOWED_ROUNDING_ERROR){
++biologically_possible;
- if(this->ChiSquaredUnphasedTable(helper, beta, p, q) < helper.chiSqModel){
+ if(this->ChiSquaredUnphasedTable(beta, p, q) < helper.chiSqModel){
chosen = β
- helper.chiSqModel = this->ChiSquaredUnphasedTable(helper, beta, p, q);
+ helper.chiSqModel = this->ChiSquaredUnphasedTable(beta, p, q);
}
}
if(gamma >= minhap - constants::ALLOWED_ROUNDING_ERROR && gamma <= maxhap + constants::ALLOWED_ROUNDING_ERROR){
++biologically_possible;
- if(this->ChiSquaredUnphasedTable(helper, gamma, p, q) < helper.chiSqModel){
+ if(this->ChiSquaredUnphasedTable(gamma, p, q) < helper.chiSqModel){
chosen = γ
- helper.chiSqModel = this->ChiSquaredUnphasedTable(helper, gamma, p, q); // implicit
+ helper.chiSqModel = this->ChiSquaredUnphasedTable(gamma, p, q); // implicit
}
}
@@ -653,7 +662,7 @@ bool LDSlave::CalculateLDUnphasedMath(helper_type& helper) const{
helper.setMultipleRoots();
//++this->possible;
- return(this->ChooseF11Calculate(helper, *chosen, p, q));
+ return(this->ChooseF11Calculate(*chosen, p, q));
} else if(__diff > 0){ // Yn2 > h2
const double constant = sqrt(yN2 - h2);
@@ -671,10 +680,10 @@ bool LDSlave::CalculateLDUnphasedMath(helper_type& helper) const{
return false;
}
- helper.chiSqModel = this->ChiSquaredUnphasedTable(helper, alpha, p, q);
+ helper.chiSqModel = this->ChiSquaredUnphasedTable(alpha, p, q);
//++this->possible;
- return(this->ChooseF11Calculate(helper, alpha, p, q));
+ return(this->ChooseF11Calculate(alpha, p, q));
} else { // Yn2 == h2
const double delta = pow((yN/2.0*a),(1.0/3.0));
@@ -691,15 +700,15 @@ bool LDSlave::CalculateLDUnphasedMath(helper_type& helper) const{
const double* chosen = α
if(alpha >= minhap - constants::ALLOWED_ROUNDING_ERROR && alpha <= maxhap + constants::ALLOWED_ROUNDING_ERROR){
++biologically_possible;
- helper.chiSqModel = this->ChiSquaredUnphasedTable(helper, alpha, p, q);
+ helper.chiSqModel = this->ChiSquaredUnphasedTable(alpha, p, q);
chosen = α
}
if(gamma >= minhap - constants::ALLOWED_ROUNDING_ERROR && gamma <= maxhap + constants::ALLOWED_ROUNDING_ERROR){
++biologically_possible;
- if(this->ChiSquaredUnphasedTable(helper, gamma, p, q) < helper.chiSqModel){
+ if(this->ChiSquaredUnphasedTable(gamma, p, q) < helper.chiSqModel){
chosen = γ
- helper.chiSqModel = this->ChiSquaredUnphasedTable(helper, gamma, p, q); // implicit
+ helper.chiSqModel = this->ChiSquaredUnphasedTable(gamma, p, q); // implicit
}
}
@@ -709,13 +718,13 @@ bool LDSlave::CalculateLDUnphasedMath(helper_type& helper) const{
}
//++this->possible;
- return(this->ChooseF11Calculate(helper, *chosen, p, q));
+ return(this->ChooseF11Calculate(*chosen, p, q));
}
return(false);
}
template
-bool LDSlave::CalculateLDUnphasedVectorizedNoMissing(helper_type& helper, const block_type& block1, const block_type& block2) const{
+bool LDSlave::CalculateLDUnphasedVectorizedNoMissing(const block_type& block1, const block_type& block2){
helper.resetUnphased();
helper_simd.counters[0] = 0;
@@ -838,15 +847,15 @@ bool LDSlave::CalculateLDUnphasedVectorizedNoMissing(helper_type& helper, con
std::cout << ticks_per_iter.count() << '\n';
#endif
- this->setFLAGs(helper, block1, block2);
- return(this->CalculateLDUnphasedMath(helper));
+ this->setFLAGs(block1, block2);
+ return(this->CalculateLDUnphasedMath());
}
template
-bool LDSlave::CalculateLDUnphasedVectorized(helper_type& helper, const block_type& block1, const block_type& block2) const{
+bool LDSlave::CalculateLDUnphasedVectorized(const block_type& block1, const block_type& block2){
#if SLAVE_DEBUG_MODE < 6
if(block1.currentMeta().has_missing == 0 && block2.currentMeta().has_missing == 0)
- return(this->CalculateLDUnphasedVectorizedNoMissing(helper, block1, block2));
+ return(this->CalculateLDUnphasedVectorizedNoMissing(block1, block2));
#endif
helper.resetUnphased();
@@ -980,15 +989,15 @@ bool LDSlave::CalculateLDUnphasedVectorized(helper_type& helper, const block_
std::cout << ticks_per_iter.count() << '\n';
#endif
- this->setFLAGs(helper, block1, block2);
- return(this->CalculateLDUnphasedMath(helper));
+ this->setFLAGs(block1, block2);
+ return(this->CalculateLDUnphasedMath());
}
template
-bool LDSlave::CalculateLDPhasedVectorized(helper_type& helper, const block_type& block1, const block_type& block2) const{
+bool LDSlave::CalculateLDPhasedVectorized(const block_type& block1, const block_type& block2){
#if SLAVE_DEBUG_MODE < 6
if(block1.currentMeta().has_missing == 0 && block2.currentMeta().has_missing == 0){
- return(this->CalculateLDPhasedVectorizedNoMissing(helper, block1, block2));
+ return(this->CalculateLDPhasedVectorizedNoMissing(block1, block2));
//return(this->CalculateLDPhasedVectorizedNoMissingNoTable(helper, block1, block2));
}
#endif
@@ -1104,12 +1113,12 @@ bool LDSlave::CalculateLDPhasedVectorized(helper_type& helper, const block_ty
std::cout << "V\t" << a.getMeta().MAF*this->samples + b.getMeta().MAF*this->samples << '\t' << ticks_per_iter.count() << '\n';
#endif
- this->setFLAGs(helper, block1, block2);
- return(this->CalculateLDPhasedMath(helper));
+ this->setFLAGs(block1, block2);
+ return(this->CalculateLDPhasedMath());
}
template
-bool LDSlave::CalculateLDPhasedVectorizedNoMissing(helper_type& helper, const block_type& block1, const block_type& block2) const{
+bool LDSlave::CalculateLDPhasedVectorizedNoMissing(const block_type& block1, const block_type& block2){
helper.resetPhased();
helper_simd.counters[0] = 0;
helper_simd.counters[1] = 0;
@@ -1206,13 +1215,13 @@ bool LDSlave::CalculateLDPhasedVectorizedNoMissing(helper_type& helper, const
std::cout << "V\t" << a.getMeta().MAF*this->samples + b.getMeta().MAF*this->samples << "\t" << ticks_per_iter.count() << '\n';
#endif
- this->setFLAGs(helper, block1, block2);
+ this->setFLAGs(block1, block2);
- return(this->CalculateLDPhasedMath(helper));
+ return(this->CalculateLDPhasedMath());
}
template
-bool LDSlave::CalculateLDPhasedVectorizedNoMissingNoTable(helper_type& helper, const block_type& block1, const block_type& block2) const{
+bool LDSlave::CalculateLDPhasedVectorizedNoMissingNoTable(const block_type& block1, const block_type& block2){
helper.resetPhased();
helper_simd.counters[0] = 0;
@@ -1233,7 +1242,7 @@ bool LDSlave::CalculateLDPhasedVectorizedNoMissingNoTable(helper_type& helper
#define ITER_SHORT { \
__intermediate = PHASED_REFREF(vectorA[i], vectorB[i]); \
- POPCOUNT(helper_simd.counters[0], __intermediate); \
+ POPCOUNT(helper_simd.counters[0], __intermediate); \
i += 1; \
}
@@ -1260,11 +1269,11 @@ bool LDSlave::CalculateLDPhasedVectorizedNoMissingNoTable(helper_type& helper
helper[0] = (tailSmallest + frontSmallest) * GENOTYPE_TRIP_COUNT*2 + helper_simd.counters[0] - this->phased_unbalanced_adjustment;
//this->setFLAGs(block1, block2);
- return(this->CalculateLDPhasedMathSimple(helper, block1, block2));
+ return(this->CalculateLDPhasedMathSimple(block1, block2));
}
template
-bool LDSlave::CalculateLDPhasedSimple(helper_type& helper, const block_type& block1, const block_type& block2) const{
+bool LDSlave::CalculateLDPhasedSimple(const block_type& block1, const block_type& block2){
helper.resetPhased();
@@ -1275,7 +1284,6 @@ bool LDSlave::CalculateLDPhasedSimple(helper_type& helper, const block_type&
const U32 n_total = bitvectorA.l_list + bitvectorB.l_list;
U32 n_same = 0;
-
// compare A to B
if(bitvectorA.l_list >= bitvectorB.l_list){
for(U32 i = 0; i < n_cycles; ++i){
@@ -1287,59 +1295,66 @@ bool LDSlave::CalculateLDPhasedSimple(helper_type& helper, const block_type&
}
}
- /*
- // test
- U32 n_same_second = 0;
- if(bitvectorA.l_list >= bitvectorB.l_list){
- for(U32 i = 0; i < n_cycles; ++i){
- n_same_second += block1.currentSBBST().GetPtr(bitvectorB.indices[i]) != nullptr;
- }
-
- } else {
- for(U32 i = 0; i < n_cycles; ++i){
- n_same_second += block2.currentSBBST().GetPtr(bitvectorA.indices[i]) != nullptr;
- }
- }
-*/
-
- //if(n_same != n_same_second)
- // std::cerr << n_same << "\t" << n_same_second << std::endl;
-
-
helper[0] = 2*this->samples - (n_total - n_same);
- return(this->CalculateLDPhasedMathSimple(helper, block1, block2));
+ return(this->CalculateLDPhasedMathSimple(block1, block2));
}
template
-bool LDSlave::CalculateLDPhasedSimpleSample(helper_type& helper, const block_type& block1, const block_type& block2) const{
+bool LDSlave::CalculateLDPhasedSimpleSample(const block_type& block1, const block_type& block2){
helper.resetPhased();
const base::HaplotypeBitVector& bitvectorA = block1.currentHaplotypeBitvector();
const base::HaplotypeBitVector& bitvectorB = block2.currentHaplotypeBitvector();
const U32 n_cycles = 1000;
- U32 n_same = 0;
+ const int64_t n_total = bitvectorA.l_list + bitvectorB.l_list; // Number of positions that are non-reference
+ int64_t n_same = 0;
// compare A to B
if(bitvectorA.l_list >= bitvectorB.l_list){
for(U32 i = 0; i < n_cycles; ++i){
- n_same += bitvectorA.get(bitvectorB.indices[i]);
+ n_same += bitvectorA.get(bitvectorB.sampled_indicies[i]); // If bit[position] is set in both then is not jointly REF
}
+ // Linear estimator
+ n_same *= (double)bitvectorB.l_list / n_cycles;
} else {
for(U32 i = 0; i < n_cycles; ++i){
- n_same += bitvectorB.get(bitvectorA.indices[i]);
+ n_same += bitvectorB.get(bitvectorA.sampled_indicies[i]); // If bit[position] is set in both then is not jointly REF
}
+ // Linear estimator
+ n_same *= (double)bitvectorA.l_list / n_cycles;
+ }
+
+ // Lower bounds
+ n_same = std::max(n_total - 2*(int64_t)this->samples, n_same);
+
+ /*
+ assert(n_same <= n_total);
+ if(2*(int64_t)this->samples - (n_total - n_same) < 0){
+ std::cerr << 2*(int64_t)this->samples - (n_total - n_same) << "\t" << n_total << "\t" << n_same << "\t" << n_total - n_same << std::endl;
+ std::cerr << n_total - 2*this->samples << " OR " << n_same << std::endl;
+ std::cerr << std::max(n_total - 2*this->samples, n_same) << std::endl;
+ exit(1);
}
+ */
- helper[0] = 2*this->samples - (n_cycles - n_same);
+ //std::cerr << n_same << "\t" << factor << "\t" << n_same*factor << "\t" << block1.currentMeta().AF << "\t" << block2.currentMeta().AF << std::endl;
+
+ // Number of joint REF = 2N - (total alleles not REF - total joint REF)
+ helper[0] = 2*this->samples - (n_total - n_same);
helper.setSampled(true);
- return(this->CalculateLDPhasedMathSimple(helper, block1, block2));
+ return(this->CalculateLDPhasedMathSimple(block1, block2));
+ /*
+ {
+ std::cerr << helper[0] << ": " << helper.R2 << " with " << n_same << std::endl;
+ }
+ */
}
template
-bool LDSlave::CalculateLDPhased(helper_type& helper, const block_type& block1, const block_type& block2) const{
+bool LDSlave::CalculateLDPhased(const block_type& block1, const block_type& block2){
helper.resetPhased();
#if SLAVE_DEBUG_MODE == 4 || SLAVE_DEBUG_MODE == 5
typedef std::chrono::duration Cycle;
@@ -1412,13 +1427,13 @@ bool LDSlave::CalculateLDPhased(helper_type& helper, const block_type& block1
std::cout << a.getMeta().MAF*this->samples + b.getMeta().MAF*this->samples << '\t' << ticks_per_iter.count() << '\n';
#endif
- this->setFLAGs(helper, block1, block2);
+ this->setFLAGs(block1, block2);
- return(this->CalculateLDPhasedMath(helper));
+ return(this->CalculateLDPhasedMath());
}
template
-bool LDSlave::CalculateLDPhasedMath(helper_type& helper) const{
+bool LDSlave::CalculateLDPhasedMath(){
// Trigger phased flag
helper.setUsedPhasedMath();
@@ -1433,8 +1448,10 @@ bool LDSlave::CalculateLDPhasedMath(helper_type& helper) const{
//++this->possible;
// Filter by total minor haplotype frequency
- if(helper.getTotalAltHaplotypeCount() < this->parameters.minimum_sum_alternative_haplotype_count)
+ if(helper.getTotalAltHaplotypeCount() < this->parameters.minimum_sum_alternative_haplotype_count){
+ //std::cerr << "FILTER: " << helper.getTotalAltHaplotypeCount() << std::endl;
return false;
+ }
// Haplotype frequencies
@@ -1489,16 +1506,19 @@ bool LDSlave::CalculateLDPhasedMath(helper_type& helper) const{
}
template
-bool LDSlave::CalculateLDPhasedMathSimple(helper_type& helper, const block_type& block1, const block_type& block2) const{
+bool LDSlave::CalculateLDPhasedMathSimple(const block_type& block1, const block_type& block2){
//++this->possible;
// If haplotype count for (0,0) >
- if(helper[0] >= 2*this->samples - this->parameters.minimum_sum_alternative_haplotype_count)
+ if(helper[0] > 2*this->samples - this->parameters.minimum_sum_alternative_haplotype_count)
return false;
// D = (joint HOM_HOM) - (HOM_A * HOM_B) = pAB - pApB
+ const double divisor = block1.currentMeta().AF * (1 - block1.currentMeta().AF) * block2.currentMeta().AF * (1 - block2.currentMeta().AF);
+ if(divisor == 0) return false;
+
helper.D = helper[0]/(2*this->samples) - block1.currentMeta().AF*block2.currentMeta().AF;
- helper.R2 = helper.D*helper.D / ( block1.currentMeta().AF * (1 - block1.currentMeta().AF) * block2.currentMeta().AF * (1 - block2.currentMeta().AF) );
+ helper.R2 = helper.D*helper.D / divisor ;
if(helper.R2 >= this->parameters.R2_min && helper.R2 <= this->parameters.R2_max){
helper.R = sqrt(helper.R2);
@@ -1522,7 +1542,7 @@ bool LDSlave::CalculateLDPhasedMathSimple(helper_type& helper, const block_ty
helper.setFastMode();
helper.setPerfectLD(helper.R2 > 0.99);
helper.setCompleteLD(helper.Dprime > 0.99);
- this->setFLAGs(helper, block1, block2);
+ this->setFLAGs(block1, block2);
return true;
}
@@ -1534,7 +1554,6 @@ bool LDSlave::CalculateLDPhasedMathSimple(helper_type& helper, const block_ty
template
bool LDSlave::DiagonalWorkOrder(const order_type& order){
block_type block1(this->manager[order.row]);
- helper_type helper;
for(U32 j = order.fromColumn; j < order.toColumn; ++j){
//std::cerr << helpers::timestamp("DEBUG", "DIAG") << i << '/' << j << '\t' << order << std::endl;
@@ -1542,7 +1561,7 @@ bool LDSlave::DiagonalWorkOrder(const order_type& order){
this->CompareBlocks(helper, block1);
else {
block_type block2(this->manager[j]);
- this->CompareBlocks(helper, block1, block2);
+ this->CompareBlocks(block1, block2);
}
}
@@ -1553,15 +1572,14 @@ bool LDSlave::DiagonalWorkOrder(const order_type& order){
template
bool LDSlave::SquareWorkOrder(const order_type& order){
block_type block1(this->manager[order.row]);
- helper_type helper;
for(U32 j = order.fromColumn; j < order.toColumn; ++j){
//std::cerr << helpers::timestamp("DEBUG", "SQUARE") << i << '/' << j << '\t' << order << std::endl;
if(order.row == j)
- this->CompareBlocks(helper, block1);
+ this->CompareBlocks(block1);
else {
block_type block2(this->manager[j]);
- this->CompareBlocks(helper, block1, block2);
+ this->CompareBlocks(block1, block2);
}
}
@@ -1596,19 +1614,19 @@ bool LDSlave::Calculate(void){
}
template
-bool LDSlave::CompareBlocksFunction(helper_type& helper, const block_type& block1, const block_type& block2) const{
+bool LDSlave::CompareBlocksFunction(const block_type& block1, const block_type& block2){
#if SLAVE_DEBUG_MODE == 1 // 1 = No debug mode
if(block1.currentMeta().all_phased == 1 && block2.currentMeta().all_phased == 1){
if(block1.currentMeta().runs + block2.currentMeta().runs <= 20){
- return(this->CalculateLDPhased(helper, block1, block2));
+ return(this->CalculateLDPhased(block1, block2));
} else {
- return(this->CalculateLDPhasedVectorized(helper, block1, block2));
+ return(this->CalculateLDPhasedVectorized(block1, block2));
}
} else {
if(block1.currentMeta().runs + block2.currentMeta().runs <= 20){
- return(this->CalculateLDUnphased(helper, block1, block2));
+ return(this->CalculateLDUnphased(block1, block2));
} else {
- return(this->CalculateLDUnphasedVectorized(helper, block1, block2));
+ return(this->CalculateLDUnphasedVectorized(block1, block2));
}
}
@@ -1664,30 +1682,29 @@ bool LDSlave::CompareBlocksFunction(helper_type& helper, const block_type& bl
}
template
-bool LDSlave::CompareBlocksFunctionForcedPhased(helper_type& helper, const block_type& block1, const block_type& block2) const{
+bool LDSlave::CompareBlocksFunctionForcedPhased(const block_type& block1, const block_type& block2){
if(block1.currentMeta().runs + block2.currentMeta().runs <= 200){
- return(this->CalculateLDPhased(helper, block1, block2));
+ return(this->CalculateLDPhased(block1, block2));
} else {
- return(this->CalculateLDPhasedVectorized(helper, block1, block2));
+ return(this->CalculateLDPhasedVectorized(block1, block2));
}
}
template
-bool LDSlave::CompareBlocksFunctionForcedPhasedSimple(helper_type& helper, const block_type& block1, const block_type& block2) const{
+bool LDSlave::CompareBlocksFunctionForcedPhasedSimple(const block_type& block1, const block_type& block2){
if(std::min(block1.currentHaplotypeBitvector(). l_list,block2.currentHaplotypeBitvector().l_list) <= 1000){
if(block1.currentMeta().has_missing == false && block2.currentMeta().has_missing == false)
- return(this->CalculateLDPhasedSimple(helper, block1, block2));
- return(this->CalculateLDPhased(helper, block1, block2));
+ return(this->CalculateLDPhasedSimple(block1, block2));
+ return(this->CalculateLDPhased(block1, block2));
} else {
- //return(false);
if(block1.currentMeta().has_missing == false && block2.currentMeta().has_missing == false)
- return(this->CalculateLDPhasedVectorizedNoMissingNoTable(helper, block1, block2));
- return(this->CalculateLDPhasedVectorized(helper, block1, block2));
+ return(this->CalculateLDPhasedVectorizedNoMissingNoTable(block1, block2));
+ return(this->CalculateLDPhasedVectorized(block1, block2));
}
}
template
-bool LDSlave::CompareBlocksFunctionForcedUnphased(helper_type& helper, const block_type& block1, const block_type& block2) const{
+bool LDSlave::CompareBlocksFunctionForcedUnphased(const block_type& block1, const block_type& block2){
// Ignore when one or both is invariant
if(block1.currentMeta().AF == 0 || block2.currentMeta().AF == 0 ||
block1.currentMeta().runs == 1 || block2.currentMeta().runs == 1)
@@ -1697,15 +1714,15 @@ bool LDSlave::CompareBlocksFunctionForcedUnphased(helper_type& helper, const
}
if(block1.currentMeta().runs + block2.currentMeta().runs <= 200){
- return(this->CalculateLDUnphased(helper, block1, block2));
+ return(this->CalculateLDUnphased(block1, block2));
} else {
- return(this->CalculateLDUnphasedVectorized(helper, block1, block2));
+ return(this->CalculateLDUnphasedVectorized(block1, block2));
}
}
// Within-block comparisons
template
-bool LDSlave::CompareBlocks(helper_type& helper, block_type& block1){
+bool LDSlave::CompareBlocks(block_type& block1){
//std::cerr << helpers::timestamp("DEBUG", "DIAG-INTERNAL") << (block1.size()*block1.size()-block1.size())/2 << std::endl;
block1.resetIterator(); // make sure it is reset
block_type block2(block1);
@@ -1719,7 +1736,7 @@ bool LDSlave::CompareBlocks(helper_type& helper, block_type& block1){
//if(block2.currentMeta().position - block1.currentMeta().position > 1000000)
// continue;
- if((this->*phase_function_across)(helper, block1, block2)){
+ if((this->*phase_function_across)(block1, block2)){
this->output_writer.Add(block1.currentMeta(), block2.currentMeta(), block1.getTotempole(), block2.getTotempole(), helper);
}
++block2;
@@ -1735,7 +1752,7 @@ bool LDSlave::CompareBlocks(helper_type& helper, block_type& block1){
// Across block comparisons
template
-bool LDSlave::CompareBlocks(helper_type& helper, block_type& block1, block_type& block2){
+bool LDSlave::CompareBlocks(block_type& block1, block_type& block2){
//std::cerr << helpers::timestamp("DEBUG", "DIAG-SQUARE") << block1.size()*block2.size() << std::endl;
// Reset
@@ -1752,7 +1769,7 @@ bool LDSlave::CompareBlocks(helper_type& helper, block_type& block1, block_ty
//if(block2.currentMeta().position - block1.currentMeta().position > 1000000)
// break;
- if((this->*phase_function_across)(helper, block1, block2)){
+ if((this->*phase_function_across)(block1, block2)){
this->output_writer.Add(block1.currentMeta(), block2.currentMeta(), block1.getTotempole(), block2.getTotempole(), helper);
}
++block2;
diff --git a/src/tomahawk/output_container_reference.h b/src/tomahawk/output_container_reference.h
index 97e0ce8..50e25b0 100644
--- a/src/tomahawk/output_container_reference.h
+++ b/src/tomahawk/output_container_reference.h
@@ -38,7 +38,7 @@ class OutputContainerReference{
n_entries(data_buffer.size() / sizeof(value_type)),
__entries(reinterpret_cast(data_buffer.buffer))
{
- assert(n_entries >= 0);
+ assert(n_entries >= 0);
assert(data_buffer.size() % sizeof(value_type) == 0);
}
diff --git a/src/tomahawk/tomahawk_importer.cpp b/src/tomahawk/tomahawk_importer.cpp
index 4a5e96c..7ffd8dd 100644
--- a/src/tomahawk/tomahawk_importer.cpp
+++ b/src/tomahawk/tomahawk_importer.cpp
@@ -91,7 +91,6 @@ bool TomahawkImporter::BuildBCF(void){
bcf_entry_type entry;
while(reader.nextVariant(entry)){
if(entry.gt_support.hasEOV || entry.isBiallelicSimple() == false || entry.gt_support.n_missing > 3){
- std::cerr << "first: " << entry.gt_support.hasEOV << "," << entry.isBiallelicSimple() << ",miss: " << entry.gt_support.n_missing << std::endl;
entry.reset();
continue;
}
@@ -121,7 +120,6 @@ bool TomahawkImporter::BuildBCF(void){
// Parse lines
while(reader.nextVariant(entry)){
if(entry.gt_support.hasEOV || entry.isBiallelicSimple() == false || entry.gt_support.n_missing > 3){
- if(entry.gt_support.hasEOV) std::cerr << "EOV: " << entry.gt_support.hasGenotypes << "," << entry.gt_support.hasEOV << "," << entry.isBiallelicSimple() << " miss: " << entry.gt_support.n_missing << std::endl;
entry.reset();
continue;
}
@@ -220,7 +218,7 @@ bool TomahawkImporter::BuildVCF(void){
this->writer_.totempole_entry.contigID = *this->sort_order_helper.contigID;
if(!this->parseVCFLine(line)){
- std::cerr << helpers::timestamp("ERROR", "VCF") << "Failed parse" << std::endl;
+ std::cerr << helpers::timestamp("ERROR", "VCF") << "Failed to parse..." << std::endl;
return false;
}
@@ -271,8 +269,7 @@ bool TomahawkImporter::parseBCFLine(bcf_entry_type& line){
}
if(!SILENT){
- std::cerr << this->writer_.totempole_entry.n_variants << std::endl;
- std::cerr << helpers::timestamp("LOG", "IMPORT") << "Switch detected: " << this->vcf_header_->getContig(this->sort_order_helper.prevcontigID).name << "->" << this->vcf_header_->getContig(line.body->CHROM).name << "..." << std::endl;
+ std::cerr << helpers::timestamp("LOG", "IMPORT") << "Switch detected: " << this->vcf_header_->getContig(this->sort_order_helper.prevcontigID).name << " -> " << this->vcf_header_->getContig(line.body->CHROM).name << "..." << std::endl;
}
this->sort_order_helper.previous_position = 0;
diff --git a/src/tomahawk/tomahawk_importer.h b/src/tomahawk/tomahawk_importer.h
index 080f8e2..931131d 100644
--- a/src/tomahawk/tomahawk_importer.h
+++ b/src/tomahawk/tomahawk_importer.h
@@ -1,5 +1,5 @@
-#ifndef VCFPARSER_H_
-#define VCFPARSER_H_
+#ifndef TOMAHAWK_TOMAHAWK_IMPORTER_
+#define TOMAHAWK_TOMAHAWK_IMPORTER_
#include "io/bcf/bcf_reader.h"
#include "io/vcf/vcf_header.h"
@@ -112,4 +112,4 @@ class TomahawkImporter {
} /* namespace Tomahawk */
-#endif /* VCFPARSER_H_ */
+#endif /* TOMAHAWK_TOMAHAWK_IMPORTER_ */
diff --git a/src/tomahawk/tomahawk_reader.cpp b/src/tomahawk/tomahawk_reader.cpp
index 89ba413..a52b348 100644
--- a/src/tomahawk/tomahawk_reader.cpp
+++ b/src/tomahawk/tomahawk_reader.cpp
@@ -273,6 +273,55 @@ bool TomahawkReader::outputBlocks(std::vector& blocks){
return true;
}
+bool TomahawkReader::summaryIndividuals(){
+ typedef bool (tomahawk::TomahawkReader::*blockFunction)(std::vector& stats, const char* const data, const U32 blockID);
+ blockFunction func__ = nullptr;
+
+ switch(this->bit_width_){
+ case 1: func__ = &TomahawkReader::statsIndividual; break;
+ case 2: func__ = &TomahawkReader::statsIndividual; break;
+ case 4: func__ = &TomahawkReader::statsIndividual; break;
+ case 8: func__ = &TomahawkReader::statsIndividual; break;
+ default:
+ std::cerr << helpers::timestamp("ERROR") << "Word sizing could not be determined!" << std::endl;
+ exit(1);
+ break;
+ }
+
+ std::vector stats(this->getHeader().getMagic().n_samples);
+ U32 previousContigID = 0;
+ for(U32 i = 0; i < this->index_->getContainer().size(); ++i){
+ if(!this->getBlock(i)){
+ std::cerr << "failed to get block" << std::endl;
+ return false;
+ }
+
+ if(this->index_->getContainer()[i].contigID != previousContigID && i != 0){
+ for(U32 i = 0; i < stats.size(); ++i){
+ std::cout << previousContigID << "\t" << i << "\t" << stats[i] << std::endl;
+ stats[i].clear();
+ }
+
+ previousContigID = this->index_->getContainer()[i].contigID;
+ }
+
+ (*this.*func__)(stats, this->data_.data(),i);
+
+ // Reset buffers
+ this->outputBuffer_.reset(); // reset
+ this->data_.reset(); // reset
+
+ //std::cerr << i << "/" << this->index_->getContainer().size() << std::endl;
+ }
+
+ for(U32 i = 0; i < stats.size(); ++i){
+ std::cout << previousContigID << "\t" << i << "\t" << stats[i] << std::endl;
+ stats[i].clear();
+ }
+
+ return true;
+}
+
bool TomahawkReader::outputBlocks(){
typedef bool (tomahawk::TomahawkReader::*blockFunction)(const U32 blockID);
blockFunction func__ = nullptr;
@@ -345,5 +394,96 @@ bool TomahawkReader::outputBlocks(){
return true;
}
+bool TomahawkReader::addRegions(const std::vector& intervals){
+ // No interval strings given
+ if(intervals.size() == 0)
+ return true;
+
+ // Build interval tree and interval vector is not set
+ if(this->interval_tree_entries == nullptr)
+ this->interval_tree_entries = new std::vector[this->getHeader().getMagic().getNumberContigs()];
+
+ if(this->interval_tree == nullptr){
+ this->interval_tree = new tree_type*[this->getHeader().getMagic().getNumberContigs()];
+ for(U32 i = 0; i < this->getHeader().getMagic().getNumberContigs(); ++i)
+ this->interval_tree[i] = nullptr;
+ } else delete [] this->interval_tree;
+
+ // Parse intervals
+ for(U32 i = 0; i < intervals.size(); ++i){
+ interval_type interval;
+ // has colon (:)
+ if(intervals[i].find(':') != std::string::npos){
+ std::vector first = helpers::split(intervals[i], ':');
+ if(first.size() != 2){
+ std::cerr << helpers::timestamp("ERROR") << "Malformed interval string: " << intervals[i] << "..." << std::endl;
+ return false;
+ }
+
+ const S32 contigID = this->header_.getContigID(first[0]);
+ if(contigID == -1){
+ std::cerr << helpers::timestamp("ERROR") << "Contig: " << intervals[i] << " is not defined in this file..." << std::endl;
+ return false;
+ }
+
+ interval.state = algorithm::ContigInterval::INTERVAL_FULL;
+ interval.contigID = contigID;
+ std::vector sections = helpers::split(first[1],'-');
+ if(sections.size() == 2){ // Is contig + position->position
+ if(std::regex_match(sections[0], std::regex("^[0-9]{1,}([\\.]{1}[0-9]{1,})?([eE]{1}[0-9]{1,})?$"))){
+ interval.start = atof(sections[0].data());
+ } else {
+ std::cerr << helpers::timestamp("ERROR") << "Illegal number: " << sections[0] << " in interval string " << intervals[i] << std::endl;
+ return false;
+ }
+
+ if(std::regex_match(sections[1], std::regex("^[0-9]{1,}([\\.]{1}[0-9]{1,})?([eE]{1}[0-9]{1,})?$"))){
+ interval.stop = atof(sections[1].data());
+ } else {
+ std::cerr << helpers::timestamp("ERROR") << "Illegal number: " << sections[1] << " in interval string " << intervals[i] << std::endl;
+ return false;
+ }
+
+ } else if(sections.size() == 1){ // Is contig + position
+ interval.state = algorithm::ContigInterval::INTERVAL_POSITION;
+ if(std::regex_match(sections[0], std::regex("^[0-9]{1,}([\\.]{1}[0-9]{1,})?([eE]{1}[0-9]{1,})?$"))){
+ interval.start = atof(sections[0].data());
+ interval.stop = atof(sections[0].data());
+ } else {
+ std::cerr << helpers::timestamp("ERROR") << "Illegal number: " << sections[0] << " in interval string " << intervals[i] << std::endl;
+ return false;
+ }
+
+ } else {
+ std::cerr << helpers::timestamp("ERROR") << "Malformed interval string: " << intervals[i] << std::endl;
+ return false;
+ }
+ } else { // Is contig only
+ interval.state = algorithm::ContigInterval::INTERVAL_CONTIG_ONLY;
+ const S32 contigID = this->header_.getContigID(intervals[i]);
+ if(contigID == -1){
+ std::cerr << helpers::timestamp("ERROR") << "Contig: " << intervals[i] << " is not defined in this file..." << std::endl;
+ return false;
+ }
+ interval.contigID = contigID;
+ interval.start = 0;
+ interval.stop = this->header_.contigs_[contigID].n_bases + 1;
+ }
+ // Store interval
+ this->interval_tree_entries[interval.contigID].push_back(interval_type(interval));
+ }
+
+ for(U32 i = 0; i < this->getHeader().getMagic().getNumberContigs(); ++i){
+ delete this->interval_tree[i];
+
+ if(this->interval_tree_entries[i].size() != 0){
+ this->interval_tree[i] = new tree_type(this->interval_tree_entries[i]);
+ } else
+ this->interval_tree[i] = nullptr;
+ }
+
+ return true;
+}
+
} /* namespace Tomahawk */
diff --git a/src/tomahawk/tomahawk_reader.h b/src/tomahawk/tomahawk_reader.h
index f67cd54..dde71f1 100644
--- a/src/tomahawk/tomahawk_reader.h
+++ b/src/tomahawk/tomahawk_reader.h
@@ -23,6 +23,31 @@
namespace tomahawk {
+struct temp{
+ temp() :
+ n_hetA(0), n_hetB(0),
+ n_homA(0), n_homB(0),
+ n_missA(0), n_missB(0)
+ {
+
+ }
+
+ void clear(void){
+ this->n_hetA = 0; this->n_hetB = 0;
+ this->n_homA = 0; this->n_homB = 0;
+ this->n_missA = 0; this->n_missB = 0;
+ }
+
+ friend std::ostream& operator<<(std::ostream& os, const temp& self){
+ os << self.n_hetA << "\t" << self.n_hetB << "\t" << self.n_homA << "\t" << self.n_homB << "\t" << self.n_missA << "\t" << self.n_missB;
+ return(os);
+ }
+
+ U32 n_hetA, n_hetB;
+ U32 n_homA, n_homB;
+ U32 n_missA, n_missB;
+};
+
// TomahawkReader class simply reads compressed data from disk
class TomahawkReader {
typedef TomahawkCalcParameters parameter_type;
@@ -53,6 +78,7 @@ class TomahawkReader {
~TomahawkReader();
bool open(const std::string input);
+ bool addRegions(const std::vector& intervals);
// Reader functions
bool getBlocks(void);
@@ -73,95 +99,7 @@ class TomahawkReader {
bool outputBlocks(std::vector& blocks);
bool outputBlocks();
- bool addRegions(const std::vector& intervals){
- if(intervals.size() == 0)
- return true;
-
- if(this->interval_tree_entries == nullptr)
- this->interval_tree_entries = new std::vector[this->getHeader().getMagic().getNumberContigs()];
-
- if(this->interval_tree == nullptr){
- this->interval_tree = new tree_type*[this->getHeader().getMagic().getNumberContigs()];
- for(U32 i = 0; i < this->getHeader().getMagic().getNumberContigs(); ++i)
- this->interval_tree[i] = nullptr;
- }
-
- // Parse intervals
- for(U32 i = 0; i < intervals.size(); ++i){
- interval_type interval;
- // has colon (:)
- if(intervals[i].find(':') != std::string::npos){
- std::vector first = helpers::split(intervals[i], ':');
- if(first.size() != 2){
- std::cerr << helpers::timestamp("ERROR") << "Malformed interval string: " << intervals[i] << "..." << std::endl;
- return false;
- }
-
- const S32 contigID = this->header_.getContigID(first[0]);
- if(contigID == -1){
- std::cerr << helpers::timestamp("ERROR") << "Contig: " << intervals[i] << " is not defined in this file..." << std::endl;
- return false;
- }
-
- interval.state = algorithm::ContigInterval::INTERVAL_FULL;
- interval.contigID = contigID;
- std::vector sections = helpers::split(first[1],'-');
- if(sections.size() == 2){ // Is contig + position->position
- if(std::regex_match(sections[0], std::regex("^[0-9]{1,}([\\.]{1}[0-9]{1,})?([eE]{1}[0-9]{1,})?$"))){
- interval.start = atof(sections[0].data()) - 1;
- } else {
- std::cerr << helpers::timestamp("ERROR") << "Illegal number: " << sections[0] << " in interval string " << intervals[i] << std::endl;
- return false;
- }
-
- if(std::regex_match(sections[1], std::regex("^[0-9]{1,}([\\.]{1}[0-9]{1,})?([eE]{1}[0-9]{1,})?$"))){
- interval.stop = atof(sections[1].data());
- } else {
- std::cerr << helpers::timestamp("ERROR") << "Illegal number: " << sections[1] << " in interval string " << intervals[i] << std::endl;
- return false;
- }
-
- } else if(sections.size() == 1){ // Is contig + position
- interval.state = algorithm::ContigInterval::INTERVAL_POSITION;
- if(std::regex_match(sections[0], std::regex("^[0-9]{1,}([\\.]{1}[0-9]{1,})?([eE]{1}[0-9]{1,})?$"))){
- interval.start = atof(sections[0].data()) - 1;
- interval.stop = atof(sections[0].data());
- } else {
- std::cerr << helpers::timestamp("ERROR") << "Illegal number: " << sections[0] << " in interval string " << intervals[i] << std::endl;
- return false;
- }
-
- } else {
- std::cerr << helpers::timestamp("ERROR") << "Malformed interval string: " << intervals[i] << std::endl;
- return false;
- }
- } else { // Is contig only
- interval.state = algorithm::ContigInterval::INTERVAL_CONTIG_ONLY;
- const S32 contigID = this->header_.getContigID(intervals[i]);
- if(contigID == -1){
- std::cerr << helpers::timestamp("ERROR") << "Contig: " << intervals[i] << " is not defined in this file..." << std::endl;
- return false;
- }
- interval.contigID = contigID;
- interval.start = 0;
- interval.stop = this->header_.contigs_[contigID].n_bases + 1;
- }
- // Store interval
- this->interval_tree_entries[interval.contigID].push_back(interval_type(interval));
- }
-
- for(U32 i = 0; i < this->getHeader().getMagic().getNumberContigs(); ++i){
- delete this->interval_tree[i];
-
- if(this->interval_tree_entries[i].size() != 0){
- this->interval_tree[i] = new tree_type(this->interval_tree_entries[i]);
- } else
- this->interval_tree[i] = nullptr;
- }
-
- return true;
- }
-
+ bool summaryIndividuals();
inline const BYTE& getBitWidth(void) const{ return(this->bit_width_); }
inline const DataOffsetPair& getOffsetPair(const U32 p) const{ return(this->blockDataOffsets_[p]); }
@@ -177,6 +115,8 @@ class TomahawkReader {
template bool WriteBlock(const char* data, const U32 blockID);
template bool WriteBlockFilter(const char* data, const U32 blockID);
+ template bool statsIndividual(std::vector& stats, const char* data, const U32 blockID);
+
private:
U64 filesize_; // filesize
U64 offset_end_of_data_;
@@ -198,6 +138,7 @@ class TomahawkReader {
io::GenericWriterInterace* writer;
+public:
tree_type** interval_tree;
std::vector* interval_tree_entries;
};
@@ -362,6 +303,34 @@ bool TomahawkReader::WriteBlock(const char* const data, const U32 blockID){
return true;
}
+template
+bool TomahawkReader::statsIndividual(std::vector& stats, const char* const data, const U32 blockID){
+ base::GenotypeContainerReference o(data,
+ this->index_->getContainer()[blockID].uncompressed_size,
+ this->index_->getContainer()[blockID],
+ this->header_.magic_.getNumberSamples(),
+ false);
+
+
+
+ //std::vector stats(this->getHeader().getMagic().n_samples);
+ for(U32 j = 0; j < o.size(); ++j){
+ U32 cumsum = 0;
+ for(U32 i = 0; i < o.currentMeta().runs; ++i){
+ for(U32 r = 0; r < o[i].runs; ++r, cumsum++){
+ stats[cumsum].n_hetA += o[i].alleleA == 1;
+ stats[cumsum].n_hetB += o[i].alleleB == 1;
+ stats[cumsum].n_homA += o[i].alleleA == 0;
+ stats[cumsum].n_homB += o[i].alleleB == 0;
+ stats[cumsum].n_missA += o[i].alleleA == 2;
+ stats[cumsum].n_missB += o[i].alleleB == 2;
+ }
+ }
+ //std::cerr << cumsum << "/" << this->getHeader().getMagic().n_samples << std::endl;
+ assert(cumsum == this->getHeader().getMagic().n_samples);
+ }
+}
+
} /* namespace Tomahawk */
#endif /* TOMAHAWK_TOMAHAWK_READER_H_ */
diff --git a/src/tomahawk/two/TomahawkOutputReader.cpp b/src/tomahawk/two/TomahawkOutputReader.cpp
index ce6172c..848e279 100644
--- a/src/tomahawk/two/TomahawkOutputReader.cpp
+++ b/src/tomahawk/two/TomahawkOutputReader.cpp
@@ -17,6 +17,7 @@ TomahawkOutputReader::TomahawkOutputReader() :
filesize_(0),
offset_end_of_data_(0),
showHeader_(true),
+ output_json_(false),
index_(nullptr),
buffer_(3000000),
data_(3000000),
@@ -393,6 +394,11 @@ bool TomahawkOutputReader::addRegions(std::vector& positions){
if(positions.size() == 0)
return true;
+ // No contigs (usually means data has not been loaded)
+ if(this->getHeader().getMagic().getNumberContigs() == 0)
+ return false;
+
+ // Construct interval tree and interval vector if not set
if(this->interval_tree_entries == nullptr)
this->interval_tree_entries = new std::vector[this->getHeader().getMagic().getNumberContigs()];
@@ -402,6 +408,7 @@ bool TomahawkOutputReader::addRegions(std::vector& positions){
this->interval_tree[i] = nullptr;
}
+ // Parse and add region
if(!this->__addRegions(positions))
return false;
@@ -416,22 +423,23 @@ bool TomahawkOutputReader::addRegions(std::vector& positions){
}
bool TomahawkOutputReader::__addRegions(std::vector& positions){
+ // For each potential interval string in vector
for(U32 i = 0; i < positions.size(); ++i){
- if(positions[i].find(',') != std::string::npos){
+ if(positions[i].find(',') != std::string::npos){ // Test if string has a comma
std::vector ret = helpers::split(positions[i], ',');
- if(ret.size() == 1){
+ if(ret.size() == 1){ //
std::cerr << helpers::timestamp("ERROR", "INTERVAL") << "Illegal interval: " << positions[i] << "!" << std::endl;
return false;
} else if(ret.size() == 2){
// parse left
interval_type intervalLeft;
- if(this->__ParseRegionIndexed(ret[0], intervalLeft))
+ if(this->__ParseRegion(ret[0], intervalLeft))
this->interval_tree_entries[intervalLeft.contigID].push_back(interval_type(intervalLeft));
// parse right
interval_type intervalRight;
- if(this->__ParseRegionIndexed(ret[1], intervalRight))
+ if(this->__ParseRegion(ret[1], intervalRight))
this->interval_tree_entries[intervalRight.contigID].push_back(interval_type(intervalRight));
} else {
@@ -442,7 +450,7 @@ bool TomahawkOutputReader::__addRegions(std::vector& positions){
// Has no comma in string
else {
interval_type interval;
- if(this->__ParseRegionIndexed(positions[i], interval))
+ if(this->__ParseRegion(positions[i], interval))
this->interval_tree_entries[interval.contigID].push_back(interval_type(interval));
}
}
@@ -450,7 +458,11 @@ bool TomahawkOutputReader::__addRegions(std::vector& positions){
return true;
}
-bool TomahawkOutputReader::__ParseRegionIndexed(const std::string& region, interval_type& interval){
+bool TomahawkOutputReader::__ParseRegion(const std::string& region, interval_type& interval) const{
+ if(region.size() == 0)
+ return false;
+
+ // Search for colon
std::vector ret = helpers::split(region, ':');
// If vector does not contain a colon
@@ -534,17 +546,34 @@ bool TomahawkOutputReader::__viewOnly(void){
std::cout << this->getHeader().getLiterals() << '\n';
std::cout << "FLAG\tCHROM_A\tPOS_A\tCHROM_B\tPOS_B\tREF_REF\tREF_ALT\tALT_REF\tALT_ALT\tD\tDprime\tR\tR2\tP\tChiSqModel\tChiSqTable\n";
}
+ const std::string version_string = std::to_string(this->header_.magic_.major_version) + "." + std::to_string(this->header_.magic_.minor_version) + "." + std::to_string(this->header_.magic_.patch_version);
+
// Natural output required parsing
size_t n_total = 0;
//if(this->writer_output_type == WRITER_TYPE::natural){
+ typedef std::ostream& (entry_type::*func)(std::ostream& os, const contig_type* const contigs) const;
+ func a = &entry_type::write;
+ if(this->output_json_){
+ a = &entry_type::writeJSON;
+ std::cout << "{\n\"type\":\"tomahawk\",\n\"sorted\":" << (this->getIndex().getController().isSorted ? "true" : "false") << ",\n\"partial_sort\":" << (this->getIndex().getController().isPartialSorted ? "true" : "false");
+ std::cout << ",\n\"version\":\"" << version_string << "\",\n\"data\":[\n";
+ }
+
+ //this->parseBlock();
while(this->parseBlock()){
OutputContainerReference o = this->getContainerReference();
n_total += o.size();
- for(U32 i = 0; i < o.size(); ++i){
- o[i].write(std::cout, this->getHeader().contigs_);
+ //if(o.size() == 0) break;
+
+ (o[0].*a)(std::cout, this->getHeader().contigs_);
+ for(U32 i = 1; i < o.size(); ++i){
+ //std::cout << ",\n";
+ (o[i].*a)(std::cout, this->getHeader().contigs_);
}
}
+
+ //std::cout << "]\n}\n";
//std::cerr << "total: " << n_total << std::endl;
//}
// Binary output without filtering simply writes it back out
@@ -570,12 +599,40 @@ bool TomahawkOutputReader::__viewRegion(void){
std::cout << "FLAG\tCHROM_A\tPOS_A\tCHROM_B\tPOS_B\tREF_REF\tREF_ALT\tALT_REF\tALT_ALT\tD\tDprime\tR\tR2\tP\tChiSqModel\tChiSqTable\n";
}
- if(this->interval_tree != nullptr){
+ if(this->interval_tree == nullptr)
+ return false;
+
+ typedef bool (TomahawkOutputReader::*func_slice)(const entry_type& entry);
+ func_slice region_function = &TomahawkOutputReader::__checkRegionUnsorted;
+ if(this->getIndex().getController().isSorted){
+ if(!SILENT)
+ std::cerr << helpers::timestamp("LOG") << "Using sorted query..." << std::endl;
+
+ region_function = &TomahawkOutputReader::__checkRegionSorted;
+
+ for(U32 i = 0; i < this->getHeader().getMagic().getNumberContigs(); ++i){ // foreach contig
+ //std::cerr << "i: " << this->interval_tree_entries[i].size() << std::endl;
+ for(U32 j = 0; j < this->interval_tree_entries[i].size(); ++j){
+ //std::cerr << this->interval_tree_entries[i][j] << std::endl;
+ std::vector blocks = this->index_->findOverlaps(this->interval_tree_entries[i][j].contigID, this->interval_tree_entries[i][j].start, this->interval_tree_entries[i][j].stop);
+ for(U32 b = 0; b < blocks.size(); ++b){
+ output_container_reference_type o = this->getContainerReferenceBlock(blocks[b]);
+ for(U32 i = 0; i < o.size(); ++i)
+ (this->*region_function)(o[i]);
+
+ } // end of blocks
+ } // end of intervals in contig
+ } // end of contigs
+ } // end case sorted
+ else {
+ if(!SILENT)
+ std::cerr << helpers::timestamp("LOG") << "Using unsorted query..." << std::endl;
+
while(this->parseBlock()){
output_container_reference_type o(this->data_);
for(U32 i = 0; i < o.size(); ++i)
- this->__checkRegionNoIndex(o[i]);
- } // end while next block
+ (this->*region_function)(o[i]);
+ }
}
return true;
@@ -601,7 +658,48 @@ bool TomahawkOutputReader::__viewFilter(void){
return true;
}
-bool TomahawkOutputReader::__checkRegionNoIndex(const entry_type& entry){
+bool TomahawkOutputReader::__checkRegionSorted(const entry_type& entry){
+ typedef std::ostream& (entry_type::*func)(std::ostream& os, const contig_type* const contigs) const;
+ func a = &entry_type::write;
+
+ // If iTree for contigA exists
+ if(this->interval_tree[entry.AcontigID] != nullptr){
+ std::vector rets = this->interval_tree[entry.AcontigID]->findOverlapping(entry.Aposition, entry.Aposition);
+ if(rets.size() > 0){
+ for(U32 i = 0; i < rets.size(); ++i){
+ if(rets[i].value != nullptr){ // if linked
+ if((entry.BcontigID == rets[i].value->contigID) &&
+ (entry.Bposition >= rets[i].value->start &&
+ entry.Bposition <= rets[i].value->stop)){
+ if(this->filters_.filter(entry)){
+ //entry.write(std::cout, this->contigs);
+ //*this->writer << entry;
+ (entry.*a)(std::cout, this->getHeader().contigs_);
+ }
+
+ return true;
+ } // end match
+ } else { // not linked
+ if(this->filters_.filter(entry)){
+ //entry.write(std::cout, this->contigs);
+ //*this->writer << entry;
+ (entry.*a)(std::cout, this->getHeader().contigs_);
+ }
+
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+
+bool TomahawkOutputReader::__checkRegionUnsorted(const entry_type& entry){
+ typedef std::ostream& (entry_type::*func)(std::ostream& os, const contig_type* const contigs) const;
+ func a = &entry_type::write;
+
// If iTree for contigA exists
if(this->interval_tree[entry.AcontigID] != nullptr){
std::vector rets = this->interval_tree[entry.AcontigID]->findOverlapping(entry.Aposition, entry.Aposition);
@@ -614,7 +712,7 @@ bool TomahawkOutputReader::__checkRegionNoIndex(const entry_type& entry){
if(this->filters_.filter(entry)){
//entry.write(std::cout, this->contigs);
//*this->writer << entry;
- entry.write(std::cout, this->getHeader().contigs_);
+ (entry.*a)(std::cout, this->getHeader().contigs_);
}
return true;
@@ -623,7 +721,7 @@ bool TomahawkOutputReader::__checkRegionNoIndex(const entry_type& entry){
if(this->filters_.filter(entry)){
//entry.write(std::cout, this->contigs);
//*this->writer << entry;
- entry.write(std::cout, this->getHeader().contigs_);
+ (entry.*a)(std::cout, this->getHeader().contigs_);
}
return true;
@@ -644,7 +742,7 @@ bool TomahawkOutputReader::__checkRegionNoIndex(const entry_type& entry){
if(this->filters_.filter(entry)){
//entry.write(std::cout, this->contigs);
//*this->writer << entry;
- entry.write(std::cout, this->getHeader().contigs_);
+ (entry.*a)(std::cout, this->getHeader().contigs_);
}
return true;
} // end match
@@ -652,7 +750,7 @@ bool TomahawkOutputReader::__checkRegionNoIndex(const entry_type& entry){
if(this->filters_.filter(entry)){
//entry.write(std::cout, this->contigs);
//*this->writer << entry;
- entry.write(std::cout, this->getHeader().contigs_);
+ (entry.*a)(std::cout, this->getHeader().contigs_);
}
return true;
@@ -698,7 +796,6 @@ bool TomahawkOutputReader::__concat(const std::vector& files, const
if(!SILENT)
std::cerr << helpers::timestamp("LOG", "CONCAT") << "Opening input: " << files[i] << "..." << std::endl;
- this->stream_.close();
self_type second_reader;
if(!second_reader.open(files[i])){
std::cerr << helpers::timestamp("ERROR","TWO") << "Failed to parse: " << files[i] << "..." << std::endl;
@@ -710,7 +807,9 @@ bool TomahawkOutputReader::__concat(const std::vector& files, const
}
while(second_reader.parseBlock())
- writer << this->data_;
+ writer << second_reader.data_;
+
+ writer.flush();
}
writer.setSorted(false);
@@ -739,7 +838,7 @@ bool TomahawkOutputReader::concat(const std::string& file_list, const std::strin
std::ifstream file_list_read(file_list);
if(!file_list_read.good()){
- std::cerr << helpers::timestamp("ERROR","TWO") << "Failed to get file_list..." << std::endl;
+ std::cerr << helpers::timestamp("ERROR","TWO") << "Failed to get file list..." << std::endl;
return false;
}
@@ -747,8 +846,8 @@ bool TomahawkOutputReader::concat(const std::string& file_list, const std::strin
std::string line;
while(getline(file_list_read, line)){
if(line.size() == 0){
- std::cerr << helpers::timestamp("WARNING","TWO") << "Empty line" << std::endl;
- break;
+ std::cerr << helpers::timestamp("WARNING","TWO") << "Empty line. Attempting to continue..." << std::endl;
+ continue;
}
files.push_back(line);
}
diff --git a/src/tomahawk/two/TomahawkOutputReader.h b/src/tomahawk/two/TomahawkOutputReader.h
index 3d44373..58d3a6b 100644
--- a/src/tomahawk/two/TomahawkOutputReader.h
+++ b/src/tomahawk/two/TomahawkOutputReader.h
@@ -38,7 +38,6 @@ class TomahawkOutputReader {
typedef io::BasicBuffer buffer_type;
typedef io::TGZFController tgzf_controller_type;
typedef totempole::Footer footer_type;
-
typedef algorithm::IntervalTree tree_type;
typedef hash::HashTable hash_table;
@@ -55,9 +54,19 @@ class TomahawkOutputReader {
inline header_type& getHeader(void){ return(this->header_); }
inline index_type* getIndexPointer(void){ return(this->index_); }
+ /**<
+ * Primary function to open a TWO file
+ * @param input Input string file location
+ * @return Returns TRUE if basic parsing is successful or FALSE otherwise
+ */
bool open(const std::string input);
+
+ /**<
+ * Adds interval regions in unparsed string format.
+ * @param positions Input vector of unparsed intervals
+ * @return Returns TRUE if parsing is successful or FALSE otherwise
+ */
bool addRegions(std::vector& positions);
- //bool OpenExtend(const std::string input);
// Streaming functions
/**<
@@ -137,18 +146,20 @@ class TomahawkOutputReader {
bool __viewFilter(void);
bool __viewRegion(void);
- bool __checkRegionNoIndex(const entry_type& entry);
+ bool __checkRegionUnsorted(const entry_type& entry);
+ bool __checkRegionSorted(const entry_type& entry);
bool __concat(const std::vector& files, const std::string& output);
bool __addRegions(std::vector& positions);
- bool __ParseRegion(const std::string& region, interval_type& interval);
- bool __ParseRegionIndexed(const std::string& region, interval_type& interval);
- bool __ParseRegionIndexedBlocks(void);
+ //bool __ParseRegion(const std::string& region, interval_type& interval);
+ bool __ParseRegion(const std::string& region, interval_type& interval) const;
+ //bool __ParseRegionIndexedBlocks(void);
public:
U64 filesize_; // filesize
U64 offset_end_of_data_;
bool showHeader_; // flag to output header or not
+ bool output_json_;
std::ifstream stream_; // reader stream
header_type header_;
@@ -162,8 +173,8 @@ class TomahawkOutputReader {
filter_type filters_; // filter parameters
- tree_type** interval_tree;
- std::vector* interval_tree_entries;
+ tree_type** interval_tree; // actual interval trees
+ std::vector* interval_tree_entries; // entries for interval trees
};
} /* namespace Tomahawk */
diff --git a/src/tomahawk/two/output_entry.h b/src/tomahawk/two/output_entry.h
index b8281d1..0fe72f3 100644
--- a/src/tomahawk/two/output_entry.h
+++ b/src/tomahawk/two/output_entry.h
@@ -81,6 +81,13 @@ struct __attribute__((packed, aligned(1))) OutputEntry{
return(os);
}
+ std::ostream& writeJSON(std::ostream& os, const contig_type* contigs) const{
+ os << '[' << std::setprecision(8) << (int)this->FLAGS << ",\"" << contigs[this->AcontigID].name << "\"," << this->Aposition << ",\"" << contigs[this->BcontigID].name << "\"," << this->Bposition
+ << ',' << this->p1 << ',' << this->p2 << ',' << this->q1 << ',' << this->q2 << ',' << this->D << ',' << this->Dprime
+ << ',' << this->R << ',' << this->R2 << ',' << this->P << ',' << this->chiSqModel << ',' << this->chiSqFisher << "]";
+ return(os);
+ }
+
// Write to buffer
friend buffer_type& operator<<(buffer_type& b, const self_type& entry){
b.Add(reinterpret_cast(&entry), sizeof(self_type));
@@ -93,7 +100,7 @@ struct __attribute__((packed, aligned(1))) OutputEntry{
U32 Amissing: 1, Aphased: 1, Aposition: 30;
U32 BcontigID;
U32 Bmissing: 1, Bphased: 1, Bposition: 30;
- float p1, p2, q1, q2;
+ double p1, p2, q1, q2;
float D, Dprime; // D and D'
float R, R2; // Correlation coefficient
double P; // P-value
diff --git a/src/tomahawk/two/output_entry_support.h b/src/tomahawk/two/output_entry_support.h
index 607bc81..6219717 100644
--- a/src/tomahawk/two/output_entry_support.h
+++ b/src/tomahawk/two/output_entry_support.h
@@ -22,6 +22,7 @@ struct OutputEntrySupport{
this->alleleCounts[4] = 0;
this->alleleCounts[5] = 0;
this->chiSqModel = 0;
+ this->R2 = 0;
// All other values can legally overflow
// They are not used
}
@@ -48,8 +49,8 @@ struct OutputEntrySupport{
// They are not used
}
- inline float& operator[](const U32& p){ return(this->alleleCounts[p]); }
- inline const float& operator[](const U32& p) const{ return(this->alleleCounts[p]); }
+ inline double& operator[](const U32& p){ return(this->alleleCounts[p]); }
+ inline const double& operator[](const U32& p) const{ return(this->alleleCounts[p]); }
void operator=(const OutputEntrySupport& other);
void printUnphasedCounts(void) const;
void printPhasedCounts(void) const;
@@ -91,15 +92,15 @@ struct OutputEntrySupport{
inline void setFailedHWEA(const bool yes = true) { this->controller |= yes << 12; } // 4096
inline void setFailedHWEB(const bool yes = true) { this->controller |= yes << 13; } // 8192
- inline const float getTotalAltHaplotypeCount(void) const{
+ inline const double getTotalAltHaplotypeCount(void) const{
// Find largest
- const float* max = &this->alleleCounts[0];
+ const double* max = &this->alleleCounts[0];
if(this->alleleCounts[1] > *max) max = &this->alleleCounts[1];
if(this->alleleCounts[4] > *max) max = &this->alleleCounts[4];
if(this->alleleCounts[5] > *max) max = &this->alleleCounts[5];
// Count number of non-major allele combinations
- float max2 = 0;
+ double max2 = 0;
if(&this->alleleCounts[0] != max) max2 += this->alleleCounts[0];
if(&this->alleleCounts[1] != max) max2 += this->alleleCounts[1];
if(&this->alleleCounts[4] != max) max2 += this->alleleCounts[4];
@@ -115,11 +116,11 @@ struct OutputEntrySupport{
double P; // Fisher or Chi-Squared P value for 2x2 contingency table
double chiSqModel; // Chi-Squared critical value for 3x3 contingency table
double chiSqFisher; // Chi-Squared critical value for 2x2 contingency table
- float totalHaplotypeCounts; // Total number of alleles
+ double totalHaplotypeCounts; // Total number of alleles
// Counters
- float alleleCounts[171];
- float haplotypeCounts[4];
+ double alleleCounts[171];
+ double haplotypeCounts[4];
};
} /* namespace Support */
diff --git a/src/tomahawk/two/output_filter.cpp b/src/tomahawk/two/output_filter.cpp
index cfb4639..1d890a7 100644
--- a/src/tomahawk/two/output_filter.cpp
+++ b/src/tomahawk/two/output_filter.cpp
@@ -6,6 +6,7 @@ namespace tomahawk {
OutputFilter::OutputFilter() :
any_filter_user_set(false),
+ upper_triangular_only(false),
minP1(0),
minP2(0),
minQ1(0),
@@ -18,10 +19,11 @@ OutputFilter::OutputFilter() :
maxMHF(std::numeric_limits::max()),
minD(-100), maxD(100),
minDprime(-100), maxDprime(100),
+ minR(0), maxR(100),
minR2(0), maxR2(100),
minP(0), maxP(100),
- minChiSquared(0), maxChiSquared(std::numeric_limits::max()),
- minPmodel(0), maxPmodel(std::numeric_limits::max()),
+ minChiSquaredTable(0), maxChiSquaredTable(std::numeric_limits::max()),
+ minChiSquaredModel(0), maxChiSquaredModel(std::numeric_limits::max()),
filterValueInclude(0),
filterValueExclude(0)
{}
@@ -31,6 +33,7 @@ OutputFilter::~OutputFilter(){}
std::string OutputFilter::getInterpretedString(void) const{
if(this->any_filter_user_set){
return(std::string(
+ "upperTriangular=" + (this->upper_triangular_only ? std::string("TRUE") : std::string("FALSE")) +
"minP1=" + std::to_string(this->minP1) + " " +
"minP2=" + std::to_string(this->minP2) + " " +
"minQ1=" + std::to_string(this->minQ1) + " " +
@@ -47,12 +50,14 @@ std::string OutputFilter::getInterpretedString(void) const{
"maxDprime=" + std::to_string(this->maxDprime) + " " +
"minR2=" + std::to_string(this->minR2) + " " +
"maxR2=" + std::to_string(this->maxR2) + " " +
+ "minR=" + std::to_string(this->minR) + " " +
+ "maxR=" + std::to_string(this->maxR) + " " +
"minP=" + std::to_string(this->minP) + " " +
"maxP=" + std::to_string(this->maxP) + " " +
- "minChiSquared=" + std::to_string(this->minChiSquared) + " " +
- "maxChiSquared=" + (this->maxChiSquared == std::numeric_limits::max() ? "DOUBLE_MAX" : std::to_string(this->maxChiSquared)) + " " +
- "minPmodel=" + std::to_string(this->minPmodel) + " " +
- "maxPmodel=" + std::to_string(this->maxPmodel) + " " +
+ "minChiSquaredTable=" + std::to_string(this->minChiSquaredTable) + " " +
+ "maxChiSquaredTable=" + (this->maxChiSquaredTable == std::numeric_limits::max() ? "DOUBLE_MAX" : std::to_string(this->maxChiSquaredTable)) + " " +
+ "minChiSquaredModel=" + std::to_string(this->minChiSquaredModel) + " " +
+ "maxChiSquaredModel=" + (this->maxChiSquaredModel == std::numeric_limits::max() ? "DOUBLE_MAX" : std::to_string(this->maxChiSquaredModel)) + " " +
"filterValueInclude=" + std::to_string(this->filterValueInclude) + " " +
"filterValueExclude=" + std::to_string(this->filterValueExclude)
));
@@ -62,15 +67,22 @@ std::string OutputFilter::getInterpretedString(void) const{
}
bool OutputFilter::filter(const entry_type& target) const{
+ if(this->upper_triangular_only){
+ if(target.AcontigID == target.BcontigID && target.Aposition > target.Bposition){
+ return false;
+ }
+ }
+
if(((target.FLAGS & this->filterValueInclude) != this->filterValueInclude) || ((target.FLAGS & this->filterValueExclude) != 0))
return false;
+ if(!this->filter(target.R, this->minR, this->maxR)) return false;
if(!this->filter(target.R2, this->minR2, this->maxR2)) return false;
if(!this->filter(target.P, this->minP, this->maxP)) return false;
if(!this->filter(target.D, this->minD, this->maxD)) return false;
if(!this->filter(target.Dprime, this->minDprime, this->maxDprime)) return false;
- if(!this->filter(target.chiSqModel, this->minChiSquared, this->maxChiSquared)) return false;
- if(!this->filter(target.chiSqFisher, this->minPmodel, this->maxPmodel)) return false;
+ if(!this->filter(target.chiSqModel, this->minChiSquaredModel, this->maxChiSquaredModel)) return false;
+ if(!this->filter(target.chiSqFisher, this->minChiSquaredTable, this->maxChiSquaredTable)) return false;
if(!this->filterJointHF(target)) return false;
if(!this->filter(target.p1, this->minP1, this->maxP1)) return false;
if(!this->filter(target.p2, this->minP2, this->maxP2)) return false;
@@ -86,13 +98,13 @@ bool OutputFilter::filterHF(const entry_type& target) const{
bool OutputFilter::filterJointHF(const entry_type& target) const{
// find largest
- const float* max = &target.p1;
+ const double* max = &target.p1;
if(target.p2 > *max) max = &target.p2;
if(target.q1 > *max) max = &target.q1;
if(target.q2 > *max) max = &target.q2;
// sum of cells excluding largest
- float total = 0;
+ double total = 0;
if(&target.p1 != max) total += target.p1;
if(&target.p2 != max) total += target.p2;
if(&target.q1 != max) total += target.q1;
@@ -177,6 +189,27 @@ bool OutputFilter::setFilterDprime(const float& min, const float& max){
return true;
}
+bool OutputFilter::setFilterR(const float& min, const float& max){
+ if(max < min){
+ std::cerr << "max < min" << std::endl;
+ return false;
+ }
+ if(min < 0 || max < 0){
+ std::cerr << "has negative" << std::endl;
+ return false;
+ }
+ if(min > 1 || max > 1){
+ std::cerr << "value > 1" << std::endl;
+ return false;
+ }
+
+ this->minR = min - constants::ALLOWED_ROUNDING_ERROR;
+ this->maxR = max + constants::ALLOWED_ROUNDING_ERROR;
+ this->trigger();
+ return true;
+}
+
+
bool OutputFilter::setFilterRsquared(const float& min, const float& max){
if(max < min){
std::cerr << "max < min" << std::endl;
@@ -217,7 +250,7 @@ bool OutputFilter::setFilterP(const double& min, const double& max){
return true;
}
-bool OutputFilter::setFilterPmodel(const double& min, const double& max){
+bool OutputFilter::setFilterChiSquaredTable(const double& min, const double& max){
if(max < min){
std::cerr << "max < min" << std::endl;
return false;
@@ -226,18 +259,14 @@ bool OutputFilter::setFilterPmodel(const double& min, const double& max){
std::cerr << "has negative" << std::endl;
return false;
}
- if(min > 1 || max > 1){
- std::cerr << "value > 1" << std::endl;
- return false;
- }
- this->minPmodel = min;
- this->maxPmodel = max;
+ this->minChiSquaredTable = min;
+ this->maxChiSquaredTable = max;
this->trigger();
return true;
}
-bool OutputFilter::setFilterChiSquared(const double& min, const double& max){
+bool OutputFilter::setFilterChiSquaredModel(const double& min, const double& max){
if(max < min){
std::cerr << "max < min" << std::endl;
return false;
@@ -247,8 +276,8 @@ bool OutputFilter::setFilterChiSquared(const double& min, const double& max){
return false;
}
- this->minChiSquared = min;
- this->maxChiSquared = max;
+ this->minChiSquaredModel = min;
+ this->maxChiSquaredModel = max;
this->trigger();
return true;
}
diff --git a/src/tomahawk/two/output_filter.h b/src/tomahawk/two/output_filter.h
index 73e987e..9bc704d 100644
--- a/src/tomahawk/two/output_filter.h
+++ b/src/tomahawk/two/output_filter.h
@@ -24,12 +24,16 @@ class OutputFilter {
bool setFilterTable(const double& a, const double& b, const double& c, const double& d);
bool setFilterTable(const double& all);
+ inline void setFilterUpperTriangular(const bool set){ this->upper_triangular_only = set; }
bool setFilterD(const float& min, const float& max);
bool setFilterDprime(const float& min, const float& max);
+ bool setFilterR(const float& min, const float& max);
bool setFilterRsquared(const float& min, const float& max);
bool setFilterP(const double& min, const double& max);
bool setFilterPmodel(const double& min, const double& max);
- bool setFilterChiSquared(const double& min, const double& max);
+ bool setFilterChiSquaredTable(const double& min, const double& max);
+ bool setFilterChiSquaredModel(const double& min, const double& max);
+
void setFilterInclude(const U16& val){ this->filterValueInclude = val; this->trigger(); }
void setFilterExclude(const U16& val){ this->filterValueExclude = val; this->trigger(); }
bool setFilterMHF(const double& min, const double& max);
@@ -44,15 +48,10 @@ class OutputFilter {
bool filterJointHF(const entry_type& target) const;
bool filterHF(const entry_type& target) const;
- bool filterD(const entry_type& type) const;
- bool filterDprime(const entry_type& type) const;
- bool filterRsquared(const entry_type& type) const;
- bool filterP(const entry_type& type) const;
- bool filterPmodel(const entry_type& type) const;
- bool filterFLAG(const entry_type& type) const;
public:
bool any_filter_user_set;
+ bool upper_triangular_only;
// Filters
double minP1, minP2, minQ1, minQ2;
@@ -60,10 +59,11 @@ class OutputFilter {
double minMHF, maxMHF;
float minD, maxD;
float minDprime, maxDprime;
+ double minR, maxR;
double minR2, maxR2;
double minP, maxP;
- double minChiSquared, maxChiSquared;
- double minPmodel, maxPmodel;
+ double minChiSquaredTable, maxChiSquaredTable;
+ double minChiSquaredModel, maxChiSquaredModel;
U16 filterValueInclude;
U16 filterValueExclude;
diff --git a/src/utility.h b/src/utility.h
index d223c67..5d808dc 100644
--- a/src/utility.h
+++ b/src/utility.h
@@ -18,7 +18,7 @@ void programMessage(const bool separator = true){
void programHelp(void){
std::cerr << "Usage: " << tomahawk::constants::PROGRAM_NAME << " [--version] [--help] " << std::endl;
- std::cerr << "Commands: import, view, calc, sort, index, stats, concat" << std::endl;
+ std::cerr << "Commands: import, view, calc, sort, concat" << std::endl;
}
void programHelpDetailed(void){
@@ -26,9 +26,8 @@ void programHelpDetailed(void){
std::cerr <<
"\n"
"concat concatenate TWO files from the same set of samples\n"
- "calc calculate linkage disequilibrium (TWO/TOI format)\n"
- "import import VCF/BCF to TWK/TWI\n"
- "stats basic statistics of TWK/TWO\n"
+ "calc calculate linkage disequilibrium\n"
+ "import import VCF/BCF to TWK\n"
"sort sort TWO file\n"
"view TWK->VCF conversion, TWO/TWK view, TWK/TWO subset and filter" << std::endl;
}
diff --git a/src/view.h b/src/view.h
index 527d161..651a371 100644
--- a/src/view.h
+++ b/src/view.h
@@ -1,5 +1,5 @@
/*
-Copyright (C) 2016-2017 Genome Research Ltd.
+Copyright (C) 2016-2018 Genome Research Ltd.
Author: Marcus D. R. Klarqvist
Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -30,18 +30,17 @@ DEALINGS IN THE SOFTWARE.
void view_usage(void){
programMessage();
std::cerr <<
- "About: Convert binary TWK->VCF or TWO->LD; subset and slice TWK/TWO data\n"
- " Data does not have to be indexed. However, operations are faster if they\n"
- " are.\n"
+ "About: Convert binary TWK->VCF or TWO->LD, subset and slice TWK/TWO data\n\n"
"Usage: " << tomahawk::constants::PROGRAM_NAME << " view [options] -i \n\n"
"Options:\n"
" -i FILE input Tomahawk (required)\n"
" -o FILE output file (- for stdout; default: -)\n"
" -h/H (twk/two) header only / no header [null]\n"
- " -O char output type: b for TWO format, n for tab-delimited format (default: b)\n"
+ " -O char output type: b for TWO format, n for tab-delimited format\n"
" -N output in tab-delimited text format (see -O) [null]\n"
" -B output in binary TWO/TWK format (see -O, default)[null]\n"
" -I STRING filter interval :pos-pos (see manual)\n"
+ //" -J output JSON object\n"
" -s Hide all program messages [null]\n\n"
// Twk parameters
@@ -50,31 +49,31 @@ void view_usage(void){
// Two parameters
"Two parameters\n"
- " -r, --minR2 FLOAT Pearson's R-squared minimum cut-off value (default: 0.1)\n"
- " -R, --maxR2 FLOAT Pearson's R-squared maximum cut-off value (default: 1.0)\n"
+ " -r, --minR2 FLOAT Pearson's R-squared minimum cut-off value\n"
+ " -R, --maxR2 FLOAT Pearson's R-squared maximum cut-off value\n"
+ " -z, --minR FLOAT Pearson's R minimum cut-off value\n"
+ " -Z, --maxR FLOAT Pearson's R maximum cut-off value\n"
" -p, --minP FLOAT smallest P-value (default: 0)\n"
" -P, --maxP FLOAT largest P-value (default: 1)\n"
" -d, --minD FLOAT smallest D value (default: -1)\n"
" -D, --maxD FLOAT largest D value (default: 1)\n"
" -b, --minDP FLOAT smallest D' value (default: 0)\n"
" -B, --maxDP FLOAT largest D' value (default: 1)\n"
- " -x, --minChi FLOAT smallest Chi-squared CV (default: 0)\n"
- " -X, --maxChi FLOAT largest Chi-squared CV (default: inf)\n"
+ " -x, --minChi FLOAT smallest Chi-squared CV of contingency table (default: 0)\n"
+ " -X, --maxChi FLOAT largest Chi-squared CV of contingency table (default: inf)\n"
" -a, --minMHC FLOAT minimum minor-haplotype count (default: 0)\n"
" -A, --maxMHC FLOAT maximum minor-haplotype count (default: inf)\n"
- " -m, --minMP FLOAT smallest model Chi-squared CV (default: 0)\n"
- " -M, --maxMP FLOAT largest model Chi-squared CV (default: inf)\n"
+ " -m, --minMCV FLOAT smallest Chi-squared CV of unphased model (default: 0)\n"
+ " -M, --maxMCV FLOAT largest Chi-squared CV of unphased model (default: inf)\n"
" -f INT include FLAG value\n"
" -F INT exclude FLAG value\n"
- " -u output only the upper triangular values\n"
- " --min FLOAT smallest cell count (default: 0)\n"
- " --max FLOAT largest cell count (default: inf)\n";
+ " -u output only the upper triangular values\n";
}
int view(int argc, char** argv){
if(argc < 3){
view_usage();
- return(1);
+ return(0);
}
static struct option long_options[] = {
@@ -82,22 +81,28 @@ int view(int argc, char** argv){
{"output", optional_argument, 0, 'o' },
{"minP", optional_argument, 0, 'p' },
{"maxP", optional_argument, 0, 'P' },
+ {"minR", optional_argument, 0, 'z' },
+ {"maxR", optional_argument, 0, 'Z' },
{"minR2", optional_argument, 0, 'r' },
{"maxR2", optional_argument, 0, 'R' },
- {"minDP", optional_argument, 0, 'd' },
- {"maxDP", optional_argument, 0, 'D' },
+ {"minDP", optional_argument, 0, 'b' },
+ {"maxDP", optional_argument, 0, 'B' },
+ {"minD", optional_argument, 0, 'd' },
+ {"maxD", optional_argument, 0, 'D' },
{"minChi", optional_argument, 0, 'x' },
{"maxChi", optional_argument, 0, 'X' },
{"minAlelles", optional_argument, 0, 'a' },
{"maxAlleles", optional_argument, 0, 'A' },
- {"minMP", optional_argument, 0, 'm' },
- {"maxMP", optional_argument, 0, 'M' },
+ {"minMCV", optional_argument, 0, 'm' },
+ {"maxMCV", optional_argument, 0, 'M' },
+ {"JSON", optional_argument, 0, 'J' },
{"flagInclude", optional_argument, 0, 'f' },
{"flagExclude", optional_argument, 0, 'F' },
+ {"upperTriangular", no_argument, 0, 'u' },
{"headerOnly", no_argument, 0, 'H' },
{"noHeader", no_argument, 0, 'h' },
{"dropGenotypes",optional_argument, 0, 'G' },
- {"interval", optional_argument, 0, 'I' },
+ {"interval", optional_argument, 0, 'I' },
{"silent", no_argument, 0, 's' },
{0,0,0,0}
};
@@ -106,14 +111,14 @@ int view(int argc, char** argv){
std::string input, output;
tomahawk::OutputFilter two_filter;
bool outputHeader = true;
- int outputType = 1;
bool dropGenotypes = false;
std::vector filter_regions;
+ //bool output_JSON = false;
int c = 0;
int long_index = 0;
int hits = 0;
- while ((c = getopt_long(argc, argv, "i:o:r:R:p:P:d:D:x:X:a:A:m:M:f:F:I:HhGsBN", long_options, &long_index)) != -1){
+ while ((c = getopt_long(argc, argv, "i:o:r:R:p:P:d:D:x:X:a:A:m:M:f:F:I:HhGsb:B:Nuz:Z:", long_options, &long_index)) != -1){
hits += 2;
switch (c){
case ':': /* missing option argument */
@@ -136,6 +141,14 @@ int view(int argc, char** argv){
case 'I':
filter_regions.push_back(std::string(optarg));
break;
+ //case 'J':
+ // output_JSON = true;
+ // --hits;
+ // break;
+ case 'u':
+ two_filter.setFilterUpperTriangular(true);
+ --hits;
+ break;
case 'r':
two_filter.minR2 = atof(optarg);
if(two_filter.minR2 < 0){
@@ -160,26 +173,51 @@ int view(int argc, char** argv){
}
two_filter.trigger();
break;
+
+ case 'z':
+ two_filter.minR = atof(optarg);
+ if(two_filter.minR < -1){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter z cannot be < -1" << std::endl;
+ return(1);
+ }
+ if(two_filter.minR > 1){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter z has to be in range -1 < r < 1" << std::endl;
+ return(1);
+ }
+ two_filter.trigger();
+ break;
+ case 'Z':
+ two_filter.maxR = atof(optarg);
+ if(two_filter.maxR < 0){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter Z cannot be negative" << std::endl;
+ return(1);
+ }
+ if(two_filter.maxR > 1){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter Z has to be in range 0 < R < 1" << std::endl;
+ return(1);
+ }
+ two_filter.trigger();
+ break;
case 'd':
- two_filter.minDprime = atof(optarg);
- if(two_filter.minDprime < 0){
- std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter d cannot be negative" << std::endl;
+ two_filter.minD = atof(optarg);
+ if(two_filter.minD < -1){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter d cannot be < -1" << std::endl;
return(1);
}
- if(two_filter.minDprime > 1){
- std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter d has to be in range 0 < d < 1" << std::endl;
+ if(two_filter.minD > 1){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter d has to be in range -1 < d < 1" << std::endl;
return(1);
}
two_filter.trigger();
break;
case 'D':
- two_filter.maxDprime = atof(optarg);
- if(two_filter.maxDprime < 0){
- std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter D cannot be negative" << std::endl;
+ two_filter.maxD = atof(optarg);
+ if(two_filter.maxD < -1){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter D cannot be < -1" << std::endl;
return(1);
}
- if(two_filter.maxDprime > 1){
- std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter D has to be in range 0 < D < 1" << std::endl;
+ if(two_filter.maxD > 1){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter D has to be in range -1 < D < 1" << std::endl;
return(1);
}
two_filter.trigger();
@@ -210,16 +248,16 @@ int view(int argc, char** argv){
break;
case 'x':
- two_filter.minChiSquared = atof(optarg);
- if(two_filter.minChiSquared < 0){
+ two_filter.minChiSquaredTable = atof(optarg);
+ if(two_filter.minChiSquaredTable < 0){
std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter x cannot be negative" << std::endl;
return(1);
}
two_filter.trigger();
break;
case 'X':
- two_filter.maxChiSquared = atof(optarg);
- if(two_filter.maxChiSquared <= 0){
+ two_filter.maxChiSquaredTable = atof(optarg);
+ if(two_filter.maxChiSquaredTable <= 0){
std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter X cannot be <= 0" << std::endl;
return(1);
}
@@ -227,16 +265,16 @@ int view(int argc, char** argv){
break;
case 'm':
- two_filter.minPmodel = atof(optarg);
- if(two_filter.minPmodel < 0){
+ two_filter.minChiSquaredModel = atof(optarg);
+ if(two_filter.minChiSquaredModel < 0){
std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter m cannot be negative" << std::endl;
return(1);
}
two_filter.trigger();
break;
case 'M':
- two_filter.maxPmodel = atof(optarg);
- if(two_filter.maxPmodel <= 0){
+ two_filter.maxChiSquaredModel = atof(optarg);
+ if(two_filter.maxChiSquaredModel <= 0){
std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter M cannot be <= 0" << std::endl;
return(1);
}
@@ -280,16 +318,30 @@ int view(int argc, char** argv){
outputHeader = false;
--hits;
break;
- case 'N':
- outputType = 1;
- --hits;
+
+ case 'b':
+ two_filter.minDprime = atof(optarg);
+ if(two_filter.minDprime < -1){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter b cannot be < -1" << std::endl;
+ return(1);
+ }
+ if(two_filter.minDprime > 1){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter b has to be in range -1 < d < 1" << std::endl;
+ return(1);
+ }
+ two_filter.trigger();
break;
case 'B':
- outputType = 0;
- --hits;
- break;
- case 'O':
- outputType = atoi(optarg);
+ two_filter.maxDprime = atof(optarg);
+ if(two_filter.maxDprime < -1){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter B cannot be < -1" << std::endl;
+ return(1);
+ }
+ if(two_filter.maxDprime > 1){
+ std::cerr << tomahawk::helpers::timestamp("ERROR") << "Parameter B has to be in range -1 < D < 1" << std::endl;
+ return(1);
+ }
+ two_filter.trigger();
break;
case 'G':
@@ -327,6 +379,7 @@ int view(int argc, char** argv){
return 1;
}
+ //tomahawk.summaryIndividuals();
tomahawk.outputBlocks();
} else if(end == tomahawk::constants::OUTPUT_LD_SUFFIX){
@@ -335,6 +388,8 @@ int view(int argc, char** argv){
tomahawk::OutputFilter& filter = reader.getFilter();
filter = tomahawk::OutputFilter(two_filter); // use copy ctor to transfer data
+ //reader.output_json_ = output_JSON;
+
if(!reader.open(input))
return 1;
diff --git a/test_files/import/bcf/1kgp3_chr20_10k_lines.bcf b/test_files/import/bcf/1kgp3_chr20_10k_lines.bcf
new file mode 100644
index 0000000..1a5e199
Binary files /dev/null and b/test_files/import/bcf/1kgp3_chr20_10k_lines.bcf differ
diff --git a/test_files/view/1kgp3_chr20_10k_lines.ld b/test_files/view/1kgp3_chr20_10k_lines.ld
new file mode 100644
index 0000000..a49d9fb
--- /dev/null
+++ b/test_files/view/1kgp3_chr20_10k_lines.ld
@@ -0,0 +1,204068 @@
+##fileformat=VCFv4.1
+##FILTER=
+##fileDate=20150218
+##reference=ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz
+##source=1000GenomesPhase3Pipeline
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig= | |