diff --git a/.settings/language.settings.xml b/.settings/language.settings.xml
index 1df044e..888f6e2 100644
--- a/.settings/language.settings.xml
+++ b/.settings/language.settings.xml
@@ -5,7 +5,7 @@
-
+
diff --git a/.travis.yml b/.travis.yml
index e915788..8953684 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -38,25 +38,7 @@ matrix:
env:
- MATRIX_EVAL="CC=gcc-6 && CXX=g++-6"
compiler: gcc
-
- - os: osx
- osx_image: xcode8
- env:
- - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9"
- compiler: gcc
-
- - os: osx
- osx_image: xcode8
- env:
- - MATRIX_EVAL="brew install gcc5 && CC=gcc-5 && CXX=g++-5"
- compiler: gcc
-
- - os: osx
- osx_image: xcode8
- env:
- - MATRIX_EVAL="brew install gcc && CC=gcc-6 && CXX=g++-6"
- compiler: gcc
-
+
# works on Precise and Trusty
- os: linux
addons:
diff --git a/R/1kgp3_chr20_105_1.jpeg b/R/1kgp3_chr20_105_1.jpeg
new file mode 100644
index 0000000..fbe9162
Binary files /dev/null and b/R/1kgp3_chr20_105_1.jpeg differ
diff --git a/R/1kgp3_chr20_105_1_symmetric.jpeg b/R/1kgp3_chr20_105_1_symmetric.jpeg
new file mode 100644
index 0000000..6aa970f
Binary files /dev/null and b/R/1kgp3_chr20_105_1_symmetric.jpeg differ
diff --git a/R/1kgp3_chr20_105_1_triangular.jpeg b/R/1kgp3_chr20_105_1_triangular.jpeg
new file mode 100644
index 0000000..b757172
Binary files /dev/null and b/R/1kgp3_chr20_105_1_triangular.jpeg differ
diff --git a/R/1kgp3_chr20_45_part1_10.jpeg b/R/1kgp3_chr20_45_part1_10.jpeg
new file mode 100644
index 0000000..adcb513
Binary files /dev/null and b/R/1kgp3_chr20_45_part1_10.jpeg differ
diff --git a/R/1kgp3_chr20_large_region.jpeg b/R/1kgp3_chr20_large_region.jpeg
new file mode 100644
index 0000000..ac56b45
Binary files /dev/null and b/R/1kgp3_chr20_large_region.jpeg differ
diff --git a/R/example_region.R b/R/example_region.R
new file mode 100644
index 0000000..9f731b8
--- /dev/null
+++ b/R/example_region.R
@@ -0,0 +1,25 @@
+# Specify colour scheme
+colors<-paste0(colorRampPalette(c("blue","red"))(10),seq(0,100,length.out = 11))
+colors[1]<-paste0(colors[1],"0")
+colors[length(colors)]<- substr(colors[length(colors)],1,7)
+
+# Define support functions
+plotLDRegion<-function(dataSource, from, to, ...){
+ # B is A but sorted for plotting reasons (Z-stack)
+ b<-dataSource[dataSource$V3>=from & dataSource$V3 <= to & dataSource$V5 >= from & dataSource$V5 <= to,]
+ b<-b[order(b$V12,decreasing = F),]
+ plot(b$V3,b$V5,pch=20,cex=.1,col=colors[cut(b$V12,breaks=seq(0,1,length.out = 11),include.lowest = T)],xlim=c(from,to),ylim=c(from,to),xaxs="i",yaxs="i", ...)
+}
+
+plotLDRegionTriangular<-function(dataSource, from, to, ...){
+ # B is A but sorted for plotting reasons (Z-stack)
+ b<-dataSource[dataSource$V3>=from & dataSource$V5<=to & dataSource$V3>=from & dataSource$V5<=to,]
+ b<-b[b$V3sum(b)*0.8)[1],col="red",lwd=2)
-abline(v=which(cumsum(b)>sum(b)*0.9)[1],col="pink",lwd=2)
-abline(v=which(cumsum(b)>sum(b)*0.95)[1],col="yellow",lwd=2)
diff --git a/R/plot_functions.R b/R/plot_functions.R
deleted file mode 100644
index abab757..0000000
--- a/R/plot_functions.R
+++ /dev/null
@@ -1,156 +0,0 @@
-colors<-paste0(colorRampPalette(c("blue","red"))(10),seq(0,100,length.out = 11))
-colors[1]<-paste0(colors[1],"0")
-colors[length(colors)]<- substr(colors[length(colors)],1,7)
-
-occurences<-sort(table(ld$V4),decreasing = T)
-multiples<-as.numeric(names(occurences[occurences>5]))
-#Decay
-decay<-function(pos, ...){
- par(mar=c(2,2,2,2))
- plot(ld[ld$V4==multiples[pos],6] - ld[ld$V4==multiples[pos],4],ld[ld$V4==multiples[pos],13],pch=20,ylim=c(0,1),cex=(-10*(log10(.5)+log10(.5)))/ld[ld$V4==multiples[pos],2], ...)
-}
-decay(1)
-
-##test
-testModel<-function(pos){
- dat<-data.frame("y"=a[a$V3==multiples[pos],12],
- "x"=a[a$V3==multiples[pos],5])
- dat$x<-dat$x-min(dat$x)+1
-
- #plot(dat$x,dat$y)
- mod <- nls(y ~ a*x^(-a*b), data = dat, start = list(a = 1, b = 0.15),algorithm="port",weights = a[a$V3==multiples[pos],12])
- # plot decay
- modelRatio<-(coef(mod)["a"]*dat$x^(-coef(mod)["a"]*coef(mod)["b"]))/mean(dat$y)
-
- par(mfrow=c(2,1))
- decay(pos,col=c("blue","red")[as.factor(modelRatio>2)])
- lines(a[a$V3==multiples[pos],5],coef(mod)["a"]*1/dat$x^coef(mod)["b"],lwd=2,col="red")
- abline(h=mean(dat$y),lwd=2,col="blue",lty="dashed")
- plot(modelRatio,type="l")
-}
-
-plot(a$V3,a$V5,pch=20,cex=.5,col=colors[cut(a$V12,breaks=seq(0,1,length.out = 11),include.lowest = T)],xlim=c(750e3,950e3),ylim=c(750e3,950e3))
-
-#
-estimator<-function(pos){
- a1<- 1 - pnorm((a[a$V3==multiples[pos],12]-mean(a[a$V3==multiples[pos],12]))/sd(a[a$V3==multiples[pos],12]),lower.tail = F)
- b1<- -log10(a[a$V3==multiples[pos],13])
- #plot(a1)
- composite<- a1 * a[a$V3==multiples[pos],12]
- plot(composite,ylim=c(0,1))
- return(composite)
-}
-x<-seq(mean(a[a$V3==multiples[1],12])-3*sd(a[a$V3==multiples[1],12]),mean(a[a$V3==multiples[1],12])+3*sd(a[a$V3==multiples[1],12]),length=1000)
-plot(x,dnorm(x,mean(a[a$V3==multiples[1],12]),sd(a[a$V3==multiples[1],12])),type="l")
-
-gtf<-read.delim("~/Documents/Homo_sapiens.GRCh37.75_main.txt",head=F)
-
-from<-min(ld$V4[ld$V3==0&ld$V5==0])
-to<-max(ld$V6[ld$V3==0&ld$V5==0])
-
-plotLDRegion<-function(dataSource, from, to, ...){
- #temp<-gtf[(gtf$V4<=from>f$V5>from)|(gtf$V4>from>f$V4=from>f$V5<=to)|(gtf$V5>from>f$V5<=to),]
- #temp<-temp[temp$V2=="protein_coding",]
- #layout(mat = c(1,2,3),heights = c(1,2,8))
- #par(mar=c(0,3,3,3))
- #plot(-1,-1,ylim=c(0,1),xlim=c(min(temp$V4),max(temp$V5)),xaxt="n",xaxs="i")
- #rect(temp$V4,0.1,temp$V5,0.9,col="black")
- #par(mar=c(0,3,0,3))
- #plot(seq(from,to,by=1000)[-1],table(cut(dataSource[dataSource$V4>=from&dataSource$V6<=to,3],seq(from,to,by=1000))),pch=20,cex=1,xaxt="n",xaxs="i")
- #par(mar=c(3,3,0,3))
- # B is A but sorted for plotting reasons (Z-stack)
- b<-dataSource[dataSource$V2<30,]
- b<-b[order(b$V13,decreasing = F),]
- plot(b$V4,b$V6,pch=20,cex=.2,col=colors[cut(b$V13,breaks=seq(0,1,length.out = 11),include.lowest = T)],xlim=c(from,to),ylim=c(from,to),xaxs="i",yaxs="i", ...)
-}
-
-for(i in 1:49){
- from=(i-1)*5e6;
- to=(i*5e6);
- filename = sprintf("~/Desktop/chr1_slices/metabric/chr1_block%i.jpeg",i)
- jpeg(filename = filename,width = 2250,height = 1500, pointsize = 10,units = "px")
- #temp<-gtf[gtf$V1==1&((gtf$V4<=from>f$V5>from)|(gtf$V4>from>f$V4=from>f$V5<=to)|(gtf$V5>from>f$V5<=to)),]
- #temp<-temp[temp$V2=="protein_coding"&temp$V3=="gene",]
- #layout(mat = c(1,2,3),heights = c(1,2,8))
- #par(mar=c(0,3,3,3))
- #plot(-1,-1,ylim=c(0,1),xlim=c(from, to),xaxt="n",xaxs="i")
- #if(nrow(temp) != 0)
- # rect(temp$V4,0.1,temp$V5,0.9,col="black")
- #par(mar=c(0,3,0,3))
- #plot(seq(from,to,by=1000)[-1],table(cut(ld[ld$V4>=from&ld$V4<=to&ld$V6>=from&ld$V6<=to,4],seq(from,to,by=1000))),pch=20,cex=1,xaxt="n",xaxs="i")
-
- #layout(mat = c(1,2),heights = c(1,4))
- #par(mar=c(0,3,1,3))
- plot.new()
- pushViewport(viewport(y = 0.6, height = 0.2, width = 0.8, angle=135))
- plotRecomb(recomb, "chr1", from, to, xaxt='n', las=2, newpage=FALSE)
- #par(mar=c(0,3,0,3))
- #segRegion<-seg[seg$Chromosome==1&((seg$Start>=from&seg$Start<=to)|(seg$End>=from&seg$End<=to)),]
- #segRegion<-segRegion[abs(segRegion$Segment_Mean)>0.5,]
- recombTemp<-recomb[recomb$V3>=from&recomb$V4<=to&recomb$V2=="chr1",]
- #plot(-1,-1,xlim=c(from,to),ylim=c(min(segRegion$Segment_Mean),max(segRegion$Segment_Mean)),xaxs="i")
- #rect(from, -0.5, to, 0.5, col="lightgrey",border=NA)
- #points(segRegion$Start,segRegion$Segment_Mean,pch=20)
- #points(segRegion$End,segRegion$Segment_Mean,pch=17)
- #abline(v=recomb$V3[recomb$V2=="chr1"],col="grey",lty="dashed")
- par(mar=c(3,3,0,3))
- internal <- ld[ld$V4>=from-(to-from)&ld$V4<=to&ld$V6>=from&ld$V6<=to+(to-from),c(4,6,13)]
- plot(internal$V4+internal$V6,internal$V6-internal$V4,pch=20,cex=.25,col=colors[cut(internal$V13,breaks=seq(0,1,length.out = 11),include.lowest = T)],xlim=c(from*2,to*2),ylim=c(0, 500e3),xaxs="i",yaxs="i",las=2)
- abline(v=recomb$V3[recomb$V2=="chr1"],col="grey",lty="dashed")
- dev.off()
-}
-
-plot(unlist(lapply(split(test,test$V3),function(a)sum(a$V12>0.5))),pch=20,type="l")
-
-scaleFunction<-function(distance){
- if(distance > 50e3)
- return(0)
- return(1/50e3*(50e3-distance))
-}
-
-scoreFunction<-function(R2, startPos, endPosVector){
- vec <- rep(0, length(R2))
- M <- 0.6
- vec[1] = scaleFunction(endPosVector[1] - startPos) * R2[1] - M
- if(length(R2) == 1)
- return(vec)
-
- for(i in 2:length(R2)){
- vec[i] = vec[i-1] + (scaleFunction(endPosVector[i] - endPosVector[i-1]) * R2[i] - M)
- }
- return(vec)
-}
-
-kernel<-function(vector, bandwidth = 5){
- normaliser<-2*sum(3/4*(1-((0:(bandwidth-1))/bandwidth)^2))
-
- ret<-rep(0,length(vector))
- for(i in (bandwidth+2):(length(vector)-bandwidth-1)){
-
- curSum = 0
- offset = bandwidth
- for(j in (i-(bandwidth+1)):(i-1)){
- curSum = curSum + vector[j]*(3/4*(1-((offset-1)/bandwidth)^2))/normaliser
- offset = offset - 1;
- }
- curSum = curSum + vector[i]
- offset = 1;
- for(j in (i+1):(i+(bandwidth+1))){
- curSum = curSum + vector[j]*(3/4*(1-((offset-1)/bandwidth)^2))/normaliser
- offset = offset + 1
- }
- #cat(i,"/",length(ret),": ", ret[i])
- #cat(curSum)
-
- ret[i] = curSum
- }
- return(ret)
-}
-
-plotRecomb<-function(data,chr,from,to,...){
- dat<-data[data$V2==chr,]
- plot(-1,-1,xlim=c(from,to),ylim=c(0,100), xaxs="i",yaxs="i", ...)
- for(i in 1:nrow(dat)){
- rect(dat$V3[i],0,dat$V4[i],dat$V5[i],col = c("black","grey")[as.factor(dat$gender)])
- }
-}
\ No newline at end of file
diff --git a/README.md b/README.md
index 085b759..52bac6e 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,32 @@
-[![Build Status](https://travis-ci.org/mklarqvist/Tomahawk.svg?branch=master)](https://travis-ci.org/mklarqvist/Tomahawk)
-[![Release](https://img.shields.io/badge/Release-beta_0.1-blue.svg)](https://github.com/mklarqvist/Tomahawk/releases)
+[![Build Status](https://travis-ci.org/mklarqvist/tomahawk.svg?branch=master)](https://travis-ci.org/mklarqvist/tomahawk)
+[![Release](https://img.shields.io/badge/Release-beta_0.3-blue.svg)](https://github.com/mklarqvist/tomahawk/releases)
[![License](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
![screenshot](tomahawk.png)
## Fast calculation of LD in large-scale cohorts
-Tomahawk efficiently represents genotypic data by exploiting basic genetic properties and we directly query this compressed representation to calculate linkage disequilibrium for all pairwise alleles/genotypes in large-scale cohorts. In order to achieve speed, Tomahawk combines primarily two efficient algorithms exploiting different concepts: 1) low genetic diversity, and 2) the large memory registers on modern processors. The first algorithm directly compares run-length encoded representation of genotypes from two vectors. The other precomputes the run-length encodings as 1-bit encodings and use SIMD-instructions to directly compare two bit-vectors. This algorithm also exploits the relatively low genetic diversity within species. Both algorithms are embarrassingly parallel.
+Tomahawk efficiently compresses genotypic data by exploiting intrinsic genetic properties, and we describe algorithms to directly query, manipulate, and explore this jointly compressed representation in place. We represent genotypic vectors as fixed-width run-length encoded (RLE) objects, with the five highest bits encoding phasing, allele A, and allele B, and the remaining bits encoding the run length. This encoding scheme is superior to dynamic-width encoding approaches in terms of iteration speed but inferior in terms of compressibility. The word size (`uint8_t`, `uint16_t`, `uint32_t`, or `uint64_t`) of RLE entries is fixed across a file and is determined by the number of samples. Tomahawk has two primary internal functions:
-The current format specifications (v.0) for `TWK`,`TWI`,`TWO`,`TOI`, and `TGZF`
+1) iterating over sites and RLE entries;
+2) computing the inner product of compressed genotypic vectors.
+
+We describe efficient algorithms to calculate genome-wide linkage disequilibrium for all pairwise alleles/genotypes in large-scale cohorts. To achieve speed, Tomahawk primarily combines two efficient algorithms exploiting different concepts: 1) low genetic diversity and 2) the large memory registers on modern processors. The first algorithm directly compares RLE entries from two vectors. The other transforms RLE entries into bit-vectors and uses SIMD instructions to directly compare two such bit-vectors. This second algorithm also exploits the relatively low genetic diversity within species using implicit heuristics. Both algorithms are embarrassingly parallel.
+
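+As an illustration of the fixed-width encoding (a sketch only, not Tomahawk's internal API), the snippet below decodes a single `uint8_t` RLE word under the layout described above: one phasing bit, two bits per allele, and the remaining three bits for the run length. The exact field order is an assumption made for this example.
+```cpp
+#include <cstdint>
+#include <iostream>
+
+// Hypothetical decoder for one fixed-width RLE word (uint8_t shown).
+// Assumed layout, high to low: phasing (1 bit), allele A (2 bits),
+// allele B (2 bits), run length (remaining 3 bits).
+struct RLEWord {
+    bool     phased;
+    uint8_t  alleleA;
+    uint8_t  alleleB;
+    uint32_t run_length; // number of consecutive samples sharing this genotype
+};
+
+inline RLEWord decodeRLE8(const uint8_t word) {
+    RLEWord w;
+    w.phased     = (word >> 7) & 0x1;
+    w.alleleA    = (word >> 5) & 0x3;
+    w.alleleB    = (word >> 3) & 0x3;
+    w.run_length =  word       & 0x7;
+    return w;
+}
+
+int main() {
+    const RLEWord w = decodeRLE8(0b10100101); // phased, A = 1, B = 0, run of 5
+    std::cout << w.phased << ' ' << int(w.alleleA) << ' '
+              << int(w.alleleB) << ' ' << w.run_length << '\n';
+    return 0;
+}
+```
+Under this layout, wider word sizes only grow the run-length field; the five-bit genotype header is unchanged.
+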
+The current format specifications (v.0) for `TWK`, `TWO`, and `TGZF`
are available [TWKv0](spec/TWKv0.pdf)
-Marcus D. R. Klarqvist ()
+### Author
+Marcus D. R. Klarqvist ()
+Department of Genetics, University of Cambridge
+Wellcome Trust Sanger Institute
+
### Installation instructions
For modern x86-64 CPUs with `SSE4.2` or later, just type `make` in the `build`
directory. If you see compilation errors, you most likely do not have `SSE4.2`.
At the present time, we do not support non-x86 CPUs or old CPU architecture.
```bash
-git clone --recursive https://github.com/mklarqvist/Tomahawk
-cd Tomahawk
+git clone --recursive https://github.com/mklarqvist/tomahawk
+cd tomahawk
cd build
make
```
@@ -31,8 +40,6 @@ compiled target.
### Brief usage instructions
Tomahawk comprises five primary commands: `import`, `calc`, `view`, `sort`, and `concat`.
-The function `stats` have partial support: currently limited to basics for `two` files.
-The function `index` is disabled at the moment.
Executing `tomahawk` gives a list of commands with brief descriptions and `tomahawk <command>`
gives detailed usage instructions for that command.
@@ -54,14 +61,11 @@ from Hardy-Weinberg equilibrium with a probability < 0.001
tomahawk import -i file.vcf -o outPrefix -m 0.2 -H 1e-3
```
-### Import-extend
-If you have split up your `vcf`/`bcf` files into multiple disjoint files
-(such as one per chromosome) it is possible to iteratively import and extend a `twk` file:
-```bash
-tomahawk import -i file.bcf -e extend.twk -m 0.2 -H 1e-3
-```
-
-### Calculating linkage disequilibrium
+### Calculating all-vs-all linkage disequilibrium
+In this example we force computations to use phased math (`-p`) and show a live progress bar
+(`-d`). Generated data is filtered on minimum genotype frequency (`-a`), squared correlation
+coefficient (`-r`), and test-statistic P-value (`-P`). The total computation is partitioned into 990 pseudo-balanced blocks (`-c`),
+and we select the first partition (`-C`) to compute using 28 threads (`-t`):
```bash
tomahawk calc -pdi file.twk -o output_prefix -a 5 -r 0.1 -P 0.1 -c 990 -C 1 -t 28
```
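+
+Note on `-c`: apart from 1, the load balancer added in this change (`src/algorithm/load_balancer_ld.cpp`) only accepts chunk counts of the form (k*k - k)/2, and 990 corresponds to a 45-way cut of the block grid. A stand-alone sketch of that check (illustration only, not part of the Tomahawk codebase):
+```cpp
+#include <cstdint>
+#include <iostream>
+
+// Report whether a requested chunk count equals (k*k - k)/2 for some k,
+// i.e. whether the all-vs-all workload can be cut into that many chunks.
+int main() {
+    const uint64_t desired_chunks = 990;
+    for (uint64_t k = 2; k <= 100000; ++k) {
+        if ((k * k - k) / 2 == desired_chunks) {
+            std::cout << desired_chunks << " chunks corresponds to a "
+                      << k << "-way cut" << std::endl;
+            return 0;
+        }
+    }
+    std::cout << "Cannot cut into " << desired_chunks << " chunks" << std::endl;
+    return 1;
+}
+```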
@@ -98,5 +102,70 @@ Perform k-way merge of partially sorted blocks
tomahawk sort -i partial.two -o sorted.two -M
```
-### License
+## Plotting
+Plot `two` data using the supplied `R` scripts (in the `R` directory).
+First transform a `two` file into human-readable `ld` format:
+```bash
+tomahawk view -hi 1kgp3_chr2_105_1.two > 1kgp3_chr2_105_1.ld
+```
+
+Either `source` the [R/example_region.R](R/example_region.R) file or copy-paste this code into `R`:
+```R
+# Specify colour scheme
+colors<-paste0(colorRampPalette(c("blue","red"))(10),seq(0,100,length.out = 11))
+colors[1]<-paste0(colors[1],"0")
+colors[length(colors)]<- substr(colors[length(colors)],1,7)
+
+# Define support functions
+plotLDRegion<-function(dataSource, from, to, ...){
+ # Assumes all the data is from the same chromosome
+ b<-dataSource[dataSource$V3>=from & dataSource$V3 <= to & dataSource$V5 >= from & dataSource$V5 <= to,]
+ b<-b[order(b$V13,decreasing = F),] # sort for Z-stack
+ plot(b$V3,b$V5,pch=20,cex=.2,col=colors[cut(b$V13,breaks=seq(0,1,length.out = 11),include.lowest = T)],xlim=c(from,to),ylim=c(from,to),xaxs="i",yaxs="i", ...)
+}
+
+plotLDRegionTriangular<-function(dataSource, from, to, ...){
+ # Assumes all the data is from the same chromosome
+ b<-dataSource[dataSource$V3>=from & dataSource$V5<=to & dataSource$V3>=from & dataSource$V5<=to,]
+ b<-b[b$V3blocks[this->selected_chunk];
- //std::cerr << Helpers::timestamp("DEBUG", "BALANCER") << this->selected_chunk << '/' << this->blocks.size() << std::endl;
-
- // attempt to merge
- // If there are both equal
- if(selected.fromRow == selected.fromColumn && selected.toRow == selected.toColumn){
- this->data_to_load.push_back(std::pair(selected.fromRow, selected.toRow));
- //std::cerr << "same: " << selected << std::endl;
- } else {
- // No voerlap
- //std::cerr << "Not same: " << selected << std::endl;
- this->data_to_load.push_back(std::pair(selected.fromRow, selected.toRow));
- this->data_to_load.push_back(std::pair(selected.fromColumn, selected.toColumn));
- }
-
- return true;
- }
-
- bool getSelectedLoadThreads(const U32 threads){
- const block_type& selected = this->blocks[selected_chunk];
- //std::cerr << Helpers::timestamp("DEBUG", "BALANCER") << "Thread balancing..." << std::endl;
-
- this->thread_distribution.resize(threads);
-
- if(threads == 1){
- this->thread_distribution[0].push_back(block_type(0, selected.getRows(), 0, selected.getColumns(), selected.fromRow, selected.toRow, selected.fromColumn, selected.toColumn, selected.isDiagonal()));
- return true;
- }
-
- //
- if(selected.isDiagonal()){
- if(!SILENT){
- std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Case is diagonal (chunk " << this->selected_chunk << '/' << this->desired_chunks << ")..." << std::endl;
- std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Total comparisons: " << Helpers::ToPrettyString(selected.getSize()) << " and per thread: " << Helpers::ToPrettyString(selected.getSize()/threads) << std::endl;
- }
-
- U32 loadThread = selected.getSize()/threads;
- U32 it = 0;
- U32 from = 0;
- U32 fromCol = 0;
- U32 threadID = 0;
-
- //
- for(U32 i = 0; i < selected.getRows(); ++i){
- for(U32 j = i; j < selected.getColumns(); ++j){
- ++it;
-
- // If number of comparions over threshold
- if(it >= loadThread){
- // if broken over a line
- // i.e. not broken on the same line number
- if(from == i){
- //std::cerr << "B\t" << threadID << ": " << from << '-' << i+1 << '\t' << fromCol << '-' << j << '\t' << selected.fromRow+from << '-' << selected.fromRow+(i+1) << '\t' << selected.fromColumn+fromCol << '-' << selected.toColumn+j << std::endl;
- this->thread_distribution[threadID].push_back(block_type(from, i+1, fromCol, j, selected.fromRow+from, selected.fromRow+i+1, selected.fromColumn+fromCol, selected.fromColumn+j));
- }
- // If broken over multiple lines
- else {
- if(threadID + 1 == threads){
- i = selected.getRows() - 1;
- j = selected.getColumns();
- }
-
- // If next line: no middle full lines
- if(from + 1 == i){
- //std::cerr << "N\t" << threadID << ": " << from << '-' << from+1 << '\t' << fromCol << '-' << selected.getColumns() << '\t' << "FALSE" << std::endl;
- //std::cerr << "N\t" << threadID << ": " << i << '-' << i+1 << '\t' << i << '-' << j << '\t' << "FALSE" << std::endl;
- this->thread_distribution[threadID].push_back(block_type(from, from+1, fromCol, selected.getColumns(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.toColumn));
- this->thread_distribution[threadID].push_back(block_type(i, i+1, i, j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+i, selected.fromColumn+j));
- fromCol = j;
- from = i;
- } else {
- //std::cerr << "E\t" << threadID << ": " << from << '-' << from + 1 << '\t' << fromCol << '-' << selected.getColumns() << '\t' << selected.fromRow+from << '-' << selected.fromRow+(from+1) << '\t' << selected.fromColumn+fromCol << '-' << selected.toColumn << std::endl;
- //std::cerr << "E\t" << threadID << ": " << from + 1 << '-' << i << '\t' << from + 1 << '-' << selected.getColumns() << '\t' << selected.fromRow+from+1 << '-' << selected.fromRow+(i) << '\t' << selected.fromColumn+from+1 << '-' << selected.toColumn << std::endl;
- //std::cerr << "E\t" << threadID << ": " << i << '-' << i + 1 << '\t' << i << '-' << j << '\t' << selected.fromRow+i << '-' << selected.fromRow+(i+1) << '\t' << selected.fromColumn+i << '-' << selected.fromColumn+j << std::endl;
- this->thread_distribution[threadID].push_back(block_type(from, from + 1, fromCol, selected.getColumns(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.toColumn));
- this->thread_distribution[threadID].push_back(block_type(from + 1, i, from + 1, selected.getColumns(), selected.fromRow+from+1, selected.fromRow+i, selected.fromColumn+from+1, selected.toColumn, true));
- this->thread_distribution[threadID].push_back(block_type(i, i + 1, i, j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+i, selected.fromColumn+j));
- }
- }
- it = 0;
- from = i;
- fromCol = j;
- ++threadID;
- }
-
- }
- }
- }
- // Is not a diagonal square
- else {
- if(!SILENT){
- std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Case is square (chunk " << this->selected_chunk << '/' << this->desired_chunks << ")..." << std::endl;
- std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Total comparisons: " << Helpers::ToPrettyString(selected.getSize()) << " and per thread: " << Helpers::ToPrettyString(selected.getSize()/threads) << std::endl;
- }
-
- U32 loadThread = selected.getSize()/threads;
- U32 it = 0;
- U32 from = 0;
- U32 fromCol = selected.getRows();
- U32 threadID = 0;
-
- //
- for(U32 i = 0; i < selected.getRows(); ++i){
- for(U32 j = selected.getRows(); j < 2*selected.getRows(); ++j){
- ++it;
-
- // If number of comparions over threshold
- if(it >= loadThread){
- // if broken over a line
- // i.e. not broken on the same line number
- if(from == i){
- //std::cerr << threadID << ": " << from << '-' << i+1 << '\t' << fromCol << '-' << j << '\t' << "FALSE" << std::endl;
- this->thread_distribution[threadID].push_back(block_type(from, i+1, fromCol, j, selected.fromRow+from, selected.fromRow+i+1, selected.fromColumn+fromCol, selected.fromColumn+j));
- }
- // If broken over multiple lines
- else {
- if(threadID + 1 == threads){
- i = selected.getRows() - 1;
- j = 2*selected.getRows();
- }
-
- // If next line: no middle full lines
- if(from + 1 == i){
- //std::cerr << threadID << ": " << from << '-' << from+1 << '\t' << fromCol << '-' << 2*selected.getRows() << '\t' << "FALSE" << std::endl;
- //std::cerr << threadID << ": " << i << '-' << i+1 << '\t' << selected.getRows() << '-' << j << '\t' << "FALSE" << std::endl;
- this->thread_distribution[threadID].push_back(block_type(from, from+1, fromCol, 2*selected.getRows(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.fromColumn+2*selected.getRows()));
- this->thread_distribution[threadID].push_back(block_type(i, i+1, 2*selected.getRows(), j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+2*selected.getRows(), selected.fromColumn+j));
- fromCol = j;
- from = i;
- } else {
- //std::cerr << threadID << ": " << from << '-' << from + 1 << '\t' << fromCol << '-' << 2*selected.getRows() << '\t' << "FALSE" << std::endl;
- //std::cerr << threadID << ": " << from + 1 << '-' << i << '\t' << selected.getRows() << '-' << 2*selected.getRows() << '\t' << "FALSE" << std::endl;
- //std::cerr << threadID << ": " << i << '-' << i + 1 << '\t' << selected.getRows() << '-' << j << '\t' << "FALSE" << std::endl;
- this->thread_distribution[threadID].push_back(block_type(from, from + 1, fromCol, 2*selected.getRows(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.fromColumn+2*selected.getRows()));
- this->thread_distribution[threadID].push_back(block_type(from + 1, i, selected.getRows(), 2*selected.getRows(), selected.fromRow+from+1, selected.fromRow+i, selected.fromColumn+selected.getRows(), selected.fromColumn+2*selected.getRows()));
- this->thread_distribution[threadID].push_back(block_type(i, i + 1, selected.getRows(), j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+selected.getRows(), selected.fromColumn+j));
- }
- }
- it = 0;
- from = i;
- fromCol = j;
- ++threadID;
- }
-
- }
- }
- }
-
- // assertion
-
-
- //std::cerr << "DEBUG" << std::endl;
- //for(U32 i = 0; i < this->thread_distribution.size(); ++i)
- // std::cerr << i << '\t' << this->thread_distribution[i].size() << std::endl;
- //std::cerr << "Has: " << this->thread_distribution.size() << " thread blocks" << std::endl;
-
- return true;
- }
-
- bool setSelected(const S32 selected){
- if(selected < 0){
- std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Cannot set select a negative chunk..." << std::endl;
- return false;
- }
-
- this->selected_chunk = selected;
- return true;
- }
-
- bool setDesired(const S32 desired){
- if(desired < 0){
- std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Cannot cut workload into a negative number of blocks..." << std::endl;
- return false;
- }
-
- this->desired_chunks = desired;
- return true;
- }
-
- bool Build(const U32 total_blocks, const U32 threads){
- if(this->selected_chunk > this->desired_chunks){
- std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Incorrectly selected block (" << this->selected_chunk << '/' << this->desired_chunks << ")..." << std::endl;
- return false;
- }
-
- // If selecting > 1 chunk
- if(this->desired_chunks != 1){
- U32 cutSize = 1;
- //std::vector backup_cuts;
- for(U32 i = 1; i < total_blocks; ++i){
-
- if((i*i - i) / 2 == this->desired_chunks)
- cutSize = i;
-
- }
-
- if(cutSize == 1){
- std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Cannot cut into " << this->desired_chunks << " chunks" << std::endl;
- return(false);
- }
-
- U32 total = 0; // Sanity
- //std::cerr << "cut-size is: " << cutSize << std::endl;
- const U32 rowLength = total_blocks / cutSize;
- for(U32 i = 0; i < cutSize-1; ++i){
- //std::cerr << i << '/' << cutSize-1 << std::endl;
- U32 j = i;
- U32 fromX = i*rowLength;
- U32 toX = (i+1)*rowLength;
- if(i + 1 == cutSize - 1)
- toX = total_blocks;
-
- for(; j < cutSize-1; ++j){
- U32 fromY = j*rowLength;
- U32 toY = (j+1)*rowLength;
-
-
- if(j + 1 == cutSize - 1)
- toY = total_blocks;
-
- //std::cerr << "(" << i << ',' << j << ")\t" << fromX << '-' << toX << '\t' << fromY << '-' << toY << std::endl;
- this->blocks.push_back(block_type(fromX, toX, fromY, toY));
- ++total;
- }
- }
-
- //std::cerr << "Total: " << total << '/' << this->desired_chunks << std::endl;
- if(total != this->desired_chunks){
- std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Corrupted balancing..." << std::endl;
- return(false);
- }
-
- } else {
- // All blocks
- this->blocks.push_back(block_type(0, total_blocks, 0, total_blocks));
- }
-
- // What data do we load?
- this->getSelectedLoad();
-
- // Divide data into threads
- if(!this->getSelectedLoadThreads(threads))
- return false;
-
- return true;
- }
-
- inline std::vector< std::pair >& getLoad(void){ return(this->data_to_load); }
-
-public:
- U32 selected_chunk;
- U32 desired_chunks;
-
- std::vector blocks;
- std::vector< std::pair > data_to_load;
- std::vector< std::vector > thread_distribution;
-};
-
-}
-#endif /* ALGORITHM_BALANCER_H_ */
diff --git a/src/algorithm/compression/RunLengthEncoding.h b/src/algorithm/compression/RunLengthEncoding.h
deleted file mode 100644
index 3331baf..0000000
--- a/src/algorithm/compression/RunLengthEncoding.h
+++ /dev/null
@@ -1,59 +0,0 @@
-#ifndef ALGORITHM_COMPRESSION_RUNLENGTHENCODING_H_
-#define ALGORITHM_COMPRESSION_RUNLENGTHENCODING_H_
-
-namespace Tomahawk{
-namespace Algorithm{
-
-template
-inline int PACK3(const BYTE& ref, char* target, T length){
- if(length <= 7){
- *target++ = 0 | ((ref & 15) << 3) | (length & 7); // highest bit is 0
- return 1;
- }
-
- char* target0 = target;
- *target++ = 128 | ((ref & 15) << 3) | (length & 7);
- length >>= 3;
-
- while(true){
- if(length <= 7){
- *target++ = 0 | (length & 127); // highest bit is 0
- length >>= 7;
- break;
- }
-
- *target++ = 128 | (length & 127);
- length >>= 7;
- }
- return(target - target0);
-}
-
-template
-inline int UNPACK3(char* target, T length, BYTE& ref){
- if((*target & 128) == 0){
- length = *target & 7;
- ref = *target >> 3;
- return 1;
- }
-
- char* target0 = target;
- length = *target & 7;
- ref = (*target >> 3) & 15;
- ++target;
- U32 offset = 3;
-
- while(true){
- length |= (*target & 127) << offset;
- offset += 7;
-
- if((*target & 128) == 0) break;
- ++target;
- }
-
- return(target - target0);
-}
-
-}
-}
-
-#endif /* ALGORITHM_COMPRESSION_RUNLENGTHENCODING_H_ */
diff --git a/src/algorithm/compression/TomahawkImportRLE.h b/src/algorithm/compression/genotype_encoder.h
similarity index 92%
rename from src/algorithm/compression/TomahawkImportRLE.h
rename to src/algorithm/compression/genotype_encoder.h
index 66a39c4..5807295 100644
--- a/src/algorithm/compression/TomahawkImportRLE.h
+++ b/src/algorithm/compression/genotype_encoder.h
@@ -4,9 +4,9 @@
#include
#include
-#include "../../math/FisherMath.h"
+#include "../../math/fisher_math.h"
#include "../../io/bcf/BCFReader.h"
-#include "RunLengthEncoding.h"
+#include "../../io/vcf/VCFLines.h"
namespace Tomahawk{
namespace Algorithm{
@@ -181,26 +181,26 @@ struct TomahawkImportRLEHelper{
inline const U64 countAlleles(void) const{ return(this->countsAlleles[0] + this->countsAlleles[1] + this->countsAlleles[2]); }
- U64 countsGenotypes[16];
- U64 countsAlleles[3];
+ U64 countsGenotypes[16];
+ U64 countsAlleles[3];
float MAF;
float MGF;
float HWE_P;
- bool missingValues;
- bool phased;
- const U64 expectedSamples;
+ bool missingValues;
+ bool phased;
+ const U64 expectedSamples;
FisherMath fisherTable;
};
-class TomahawkImportRLE {
- typedef TomahawkImportRLE self_type;
- typedef bool (Tomahawk::Algorithm::TomahawkImportRLE::*rleFunction)(const VCF::VCFLine& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs); // Type cast pointer to function
- typedef bool (Tomahawk::Algorithm::TomahawkImportRLE::*bcfFunction)(const BCF::BCFEntry& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs); // Type cast pointer to function
+class GenotypeEncoder {
+ typedef GenotypeEncoder self_type;
+ typedef bool (self_type::*rleFunction)(const VCF::VCFLine& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs); // Type cast pointer to function
+ typedef bool (self_type::*bcfFunction)(const BCF::BCFEntry& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs); // Type cast pointer to function
typedef TomahawkImportRLEHelper helper_type;
public:
- TomahawkImportRLE(const U64 samples) :
+ GenotypeEncoder(const U64 samples) :
n_samples(samples),
encode(nullptr),
encodeComplex(nullptr),
@@ -212,7 +212,7 @@ class TomahawkImportRLE {
{
}
- ~TomahawkImportRLE(){
+ ~GenotypeEncoder(){
}
void DetermineBitWidth(void){
@@ -284,12 +284,12 @@ class TomahawkImportRLE {
template <class T> bool RunLengthEncodeBCF(const BCF::BCFEntry& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs);
private:
- U64 n_samples;
- rleFunction encode; // encoding function
- rleFunction encodeComplex; // encoding function
+ U64 n_samples;
+ rleFunction encode; // encoding function
+ rleFunction encodeComplex; // encoding function
bcfFunction encodeBCF;
- BYTE bit_width;
- BYTE shiftSize; // bit shift size
+ BYTE bit_width;
+ BYTE shiftSize; // bit shift size
helper_type helper;
public:
@@ -297,7 +297,7 @@ class TomahawkImportRLE {
};
template <class T>
-bool TomahawkImportRLE::RunLengthEncodeBCF(const BCF::BCFEntry& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs){
+bool GenotypeEncoder::RunLengthEncodeBCF(const BCF::BCFEntry& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs){
//std::cerr << meta.size() << '\t' << runs.size();
meta += (U32)line.body->POS + 1;
@@ -360,12 +360,13 @@ bool TomahawkImportRLE::RunLengthEncodeBCF(const BCF::BCFEntry& line, IO::BasicB
this->helper.calculateHardyWeinberg();
// Position
- U32& position = *reinterpret_cast(&meta[meta.pointer - 5]);
+ U32& position = *reinterpret_cast<U32*>(&meta[meta.size() - 5]);
position <<= 2;
position |= this->helper.phased << 1;
position |= this->helper.missingValues << 0;
meta += this->helper.MGF;
meta += this->helper.HWE_P;
+ //std::cerr << this->helper.MGF << std::endl;
//n_runs = runs.pointer - runs_pointer_begin; // temp
meta += n_runs;
@@ -374,16 +375,16 @@ bool TomahawkImportRLE::RunLengthEncodeBCF(const BCF::BCFEntry& line, IO::BasicB
}
template <class T>
-bool TomahawkImportRLE::RunLengthEncodeSimple(const VCF::VCFLine& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs){
+bool GenotypeEncoder::RunLengthEncodeSimple(const VCF::VCFLine& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs){
meta += line.position;
meta += line.ref_alt;
- ///////////////////////////////
+ /*//////////////////////////////
// Encoding:
// First 8|T| - TOMAHAWK_SNP_PACK_WIDTH bits encode the run length
// remaining TOMAHAWK_SNP_PACK_WIDTH bits encode
// TOMAHAWK_ALLELE_PACK_WIDTH bits of snpA and TOMAHAWK_ALLELE_PACK_WIDTH bits of snpB
- ///////////////////////////////
+ //////////////////////////////*/
T run_length = 1;
// ASCII value for '.' is 46
@@ -466,7 +467,7 @@ bool TomahawkImportRLE::RunLengthEncodeSimple(const VCF::VCFLine& line, IO::Basi
this->helper.calculateHardyWeinberg();
// Position
- U32& position = *reinterpret_cast(&meta[meta.pointer - 5]);
+ U32& position = *reinterpret_cast<U32*>(&meta[meta.size() - 5]);
position <<= 2;
position |= this->helper.phased << 1;
position |= this->helper.missingValues << 0;
@@ -480,16 +481,16 @@ bool TomahawkImportRLE::RunLengthEncodeSimple(const VCF::VCFLine& line, IO::Basi
template <class T>
-bool TomahawkImportRLE::RunLengthEncodeComplex(const VCF::VCFLine& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs){
+bool GenotypeEncoder::RunLengthEncodeComplex(const VCF::VCFLine& line, IO::BasicBuffer& meta, IO::BasicBuffer& runs){
meta += line.position;
meta += line.ref_alt;
- ///////////////////////////////
+ /*//////////////////////////////
// Encoding:
// First 8|T| - TOMAHAWK_SNP_PACK_WIDTH bits encode the run length
// remaining TOMAHAWK_SNP_PACK_WIDTH bits encode
// TOMAHAWK_ALLELE_PACK_WIDTH bits of snpA and TOMAHAWK_ALLELE_PACK_WIDTH bits of snpB
- ///////////////////////////////
+ //////////////////////////////*/
T run_length = 1;
// ASCII value for '.' is 46
@@ -570,7 +571,7 @@ bool TomahawkImportRLE::RunLengthEncodeComplex(const VCF::VCFLine& line, IO::Bas
this->helper.calculateHardyWeinberg();
// Position
- U32& position = *reinterpret_cast(&meta[meta.pointer - 5]);
+ U32& position = *reinterpret_cast<U32*>(&meta[meta.size() - 5]);
position <<= 2;
position |= this->helper.phased << 1;
position |= this->helper.missingValues << 0;
diff --git a/src/algorithm/GenotypeBitPacker.h b/src/algorithm/genotype_bitpacker.h
similarity index 95%
rename from src/algorithm/GenotypeBitPacker.h
rename to src/algorithm/genotype_bitpacker.h
index 1fc83de..bb69111 100644
--- a/src/algorithm/GenotypeBitPacker.h
+++ b/src/algorithm/genotype_bitpacker.h
@@ -1,5 +1,5 @@
-#ifndef ALGORITHM_GENOTYPEBITPACKER_H_
-#define ALGORITHM_GENOTYPEBITPACKER_H_
+#ifndef ALGORITHM_GENOTYPE_BITPACKER_H_
+#define ALGORITHM_GENOTYPE_BITPACKER_H_
namespace Tomahawk{
namespace Algorithm{
@@ -86,4 +86,4 @@ class GenotypeBitPacker{
}
}
-#endif /* ALGORITHM_GENOTYPEBITPACKER_H_ */
+#endif /* ALGORITHM_GENOTYPE_BITPACKER_H_ */
diff --git a/src/algorithm/LoadBalancerBlock.h b/src/algorithm/load_balancer_block.h
similarity index 81%
rename from src/algorithm/LoadBalancerBlock.h
rename to src/algorithm/load_balancer_block.h
index a01daeb..d6f45f3 100644
--- a/src/algorithm/LoadBalancerBlock.h
+++ b/src/algorithm/load_balancer_block.h
@@ -1,8 +1,8 @@
-#ifndef ALGORITHM_LOADBALANCERBLOCK_H_
-#define ALGORITHM_LOADBALANCERBLOCK_H_
+#ifndef ALGORITHM_LOAD_BALANCER_BLOCK_H_
+#define ALGORITHM_LOAD_BALANCER_BLOCK_H_
#include
-#include "../support/TypeDefinitions.h"
+#include "../support/type_definitions.h"
namespace Tomahawk{
@@ -21,15 +21,15 @@ struct LoadBalancerBlock{
{}
LoadBalancerBlock(const U32 fromRow, const U32 toRow, const U32 fromColumn, const U32 toColumn, const U32 fromRowAbsolute, const U32 toRowAbsolute, const U32 fromColumnAbsolute, const U32 toColumnAbsolute, bool stagger = false) :
- fromRow(fromRow),
- toRow(toRow),
- fromColumn(fromColumn),
- toColumn(toColumn),
- staggered(stagger),
- fromRowAbsolute(fromRowAbsolute),
- toRowAbsolute(toRowAbsolute),
- fromColumnAbsolute(fromColumnAbsolute),
- toColumnAbsolute(toColumnAbsolute)
+ fromRow(fromRow),
+ toRow(toRow),
+ fromColumn(fromColumn),
+ toColumn(toColumn),
+ staggered(stagger),
+ fromRowAbsolute(fromRowAbsolute),
+ toRowAbsolute(toRowAbsolute),
+ fromColumnAbsolute(fromColumnAbsolute),
+ toColumnAbsolute(toColumnAbsolute)
{}
~LoadBalancerBlock(){}
@@ -44,10 +44,10 @@ struct LoadBalancerBlock{
}
inline LoadBalancerBlock& operator()(const U32 fromRow, const U32 toRow, const U32 fromColumn, const U32 toColumn, const bool diagonal){
- this->fromRow = fromRow;
- this->toRow = toRow;
+ this->fromRow = fromRow;
+ this->toRow = toRow;
this->fromColumn = fromColumn;
- this->toColumn= toColumn;
+ this->toColumn = toColumn;
return(*this);
}
@@ -80,4 +80,4 @@ struct LoadBalancerBlock{
}
-#endif /* ALGORITHM_LOADBALANCERBLOCK_H_ */
+#endif /* ALGORITHM_LOAD_BALANCER_BLOCK_H_ */
diff --git a/src/algorithm/load_balancer_ld.cpp b/src/algorithm/load_balancer_ld.cpp
new file mode 100644
index 0000000..d5a09af
--- /dev/null
+++ b/src/algorithm/load_balancer_ld.cpp
@@ -0,0 +1,257 @@
+#include "load_balancer_ld.h"
+
+namespace Tomahawk{
+
+LoadBalancerLD::LoadBalancerLD() : selected_chunk(0), desired_chunks(1){}
+LoadBalancerLD::~LoadBalancerLD(){}
+
+bool LoadBalancerLD::getSelectedLoad(){
+ //std::cerr << Helpers::timestamp("DEBUG", "BALANCER") << "What data to load?" << std::endl;
+ value_type selected = this->blocks[this->selected_chunk];
+ //std::cerr << Helpers::timestamp("DEBUG", "BALANCER") << this->selected_chunk << '/' << this->blocks.size() << std::endl;
+
+ // attempt to merge
+ // If there are both equal
+ if(selected.fromRow == selected.fromColumn && selected.toRow == selected.toColumn){
+ this->data_to_load.push_back(std::pair<U32, U32>(selected.fromRow, selected.toRow));
+ //std::cerr << "same: " << selected << std::endl;
+ } else {
+ // No overlap
+ //std::cerr << "Not same: " << selected << std::endl;
+ this->data_to_load.push_back(std::pair<U32, U32>(selected.fromRow, selected.toRow));
+ this->data_to_load.push_back(std::pair<U32, U32>(selected.fromColumn, selected.toColumn));
+ }
+
+ return true;
+}
+
+bool LoadBalancerLD::getSelectedLoadThreads(const U32 threads){
+ const value_type& selected = this->blocks[selected_chunk];
+ //std::cerr << Helpers::timestamp("DEBUG", "BALANCER") << "Thread balancing..." << std::endl;
+
+ this->thread_distribution.resize(threads);
+
+ if(threads == 1){
+ this->thread_distribution[0].push_back(value_type(0, selected.getRows(), 0, selected.getColumns(), selected.fromRow, selected.toRow, selected.fromColumn, selected.toColumn, selected.isDiagonal()));
+ return true;
+ }
+
+ //
+ if(selected.isDiagonal()){
+ if(!SILENT){
+ std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Case is diagonal (chunk " << this->selected_chunk << '/' << this->desired_chunks << ")..." << std::endl;
+ std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Total comparisons: " << Helpers::ToPrettyString(selected.getSize()) << " and per thread: " << Helpers::ToPrettyString(selected.getSize()/threads) << std::endl;
+ }
+
+ U32 loadThread = selected.getSize()/threads;
+ U32 it = 0;
+ U32 from = 0;
+ U32 fromCol = 0;
+ U32 threadID = 0;
+
+ //
+ for(U32 i = 0; i < selected.getRows(); ++i){
+ for(U32 j = i; j < selected.getColumns(); ++j){
+ ++it;
+
+ // If number of comparisons over threshold
+ if(it >= loadThread){
+ // if broken over a line
+ // i.e. not broken on the same line number
+ if(from == i){
+ //std::cerr << "B\t" << threadID << ": " << from << '-' << i+1 << '\t' << fromCol << '-' << j << '\t' << selected.fromRow+from << '-' << selected.fromRow+(i+1) << '\t' << selected.fromColumn+fromCol << '-' << selected.toColumn+j << std::endl;
+ this->thread_distribution[threadID].push_back(value_type(from, i+1, fromCol, j, selected.fromRow+from, selected.fromRow+i+1, selected.fromColumn+fromCol, selected.fromColumn+j));
+ }
+ // If broken over multiple lines
+ else {
+ if(threadID + 1 == threads){
+ i = selected.getRows() - 1;
+ j = selected.getColumns();
+ }
+
+ // If next line: no middle full lines
+ if(from + 1 == i){
+ //std::cerr << "N\t" << threadID << ": " << from << '-' << from+1 << '\t' << fromCol << '-' << selected.getColumns() << '\t' << "FALSE" << std::endl;
+ //std::cerr << "N\t" << threadID << ": " << i << '-' << i+1 << '\t' << i << '-' << j << '\t' << "FALSE" << std::endl;
+ this->thread_distribution[threadID].push_back(value_type(from, from+1, fromCol, selected.getColumns(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.toColumn));
+ this->thread_distribution[threadID].push_back(value_type(i, i+1, i, j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+i, selected.fromColumn+j));
+ fromCol = j;
+ from = i;
+ } else {
+ //std::cerr << "E\t" << threadID << ": " << from << '-' << from + 1 << '\t' << fromCol << '-' << selected.getColumns() << '\t' << selected.fromRow+from << '-' << selected.fromRow+(from+1) << '\t' << selected.fromColumn+fromCol << '-' << selected.toColumn << std::endl;
+ //std::cerr << "E\t" << threadID << ": " << from + 1 << '-' << i << '\t' << from + 1 << '-' << selected.getColumns() << '\t' << selected.fromRow+from+1 << '-' << selected.fromRow+(i) << '\t' << selected.fromColumn+from+1 << '-' << selected.toColumn << std::endl;
+ //std::cerr << "E\t" << threadID << ": " << i << '-' << i + 1 << '\t' << i << '-' << j << '\t' << selected.fromRow+i << '-' << selected.fromRow+(i+1) << '\t' << selected.fromColumn+i << '-' << selected.fromColumn+j << std::endl;
+ this->thread_distribution[threadID].push_back(value_type(from, from + 1, fromCol, selected.getColumns(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.toColumn));
+ this->thread_distribution[threadID].push_back(value_type(from + 1, i, from + 1, selected.getColumns(), selected.fromRow+from+1, selected.fromRow+i, selected.fromColumn+from+1, selected.toColumn, true));
+ this->thread_distribution[threadID].push_back(value_type(i, i + 1, i, j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+i, selected.fromColumn+j));
+ }
+ }
+ it = 0;
+ from = i;
+ fromCol = j;
+ ++threadID;
+ }
+
+ }
+ }
+ }
+ // Is not a diagonal square
+ else {
+ if(!SILENT){
+ std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Case is square (chunk " << this->selected_chunk << '/' << this->desired_chunks << ")..." << std::endl;
+ std::cerr << Helpers::timestamp("LOG", "BALANCER") << "Total comparisons: " << Helpers::ToPrettyString(selected.getSize()) << " and per thread: " << Helpers::ToPrettyString(selected.getSize()/threads) << std::endl;
+ }
+
+ U32 loadThread = selected.getSize()/threads;
+ U32 it = 0;
+ U32 from = 0;
+ U32 fromCol = selected.getRows();
+ U32 threadID = 0;
+
+ //
+ for(U32 i = 0; i < selected.getRows(); ++i){
+ for(U32 j = selected.getRows(); j < 2*selected.getRows(); ++j){
+ ++it;
+
+ // If number of comparisons over threshold
+ if(it >= loadThread){
+ // if broken over a line
+ // i.e. not broken on the same line number
+ if(from == i){
+ //std::cerr << threadID << ": " << from << '-' << i+1 << '\t' << fromCol << '-' << j << '\t' << "FALSE" << std::endl;
+ this->thread_distribution[threadID].push_back(value_type(from, i+1, fromCol, j, selected.fromRow+from, selected.fromRow+i+1, selected.fromColumn+fromCol, selected.fromColumn+j));
+ }
+ // If broken over multiple lines
+ else {
+ if(threadID + 1 == threads){
+ i = selected.getRows() - 1;
+ j = 2*selected.getRows();
+ }
+
+ // If next line: no middle full lines
+ if(from + 1 == i){
+ //std::cerr << threadID << ": " << from << '-' << from+1 << '\t' << fromCol << '-' << 2*selected.getRows() << '\t' << "FALSE" << std::endl;
+ //std::cerr << threadID << ": " << i << '-' << i+1 << '\t' << selected.getRows() << '-' << j << '\t' << "FALSE" << std::endl;
+ this->thread_distribution[threadID].push_back(value_type(from, from+1, fromCol, 2*selected.getRows(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.fromColumn+2*selected.getRows()));
+ this->thread_distribution[threadID].push_back(value_type(i, i+1, 2*selected.getRows(), j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+2*selected.getRows(), selected.fromColumn+j));
+ fromCol = j;
+ from = i;
+ } else {
+ //std::cerr << threadID << ": " << from << '-' << from + 1 << '\t' << fromCol << '-' << 2*selected.getRows() << '\t' << "FALSE" << std::endl;
+ //std::cerr << threadID << ": " << from + 1 << '-' << i << '\t' << selected.getRows() << '-' << 2*selected.getRows() << '\t' << "FALSE" << std::endl;
+ //std::cerr << threadID << ": " << i << '-' << i + 1 << '\t' << selected.getRows() << '-' << j << '\t' << "FALSE" << std::endl;
+ this->thread_distribution[threadID].push_back(value_type(from, from + 1, fromCol, 2*selected.getRows(), selected.fromRow+from, selected.fromRow+from+1, selected.fromColumn+fromCol, selected.fromColumn+2*selected.getRows()));
+ this->thread_distribution[threadID].push_back(value_type(from + 1, i, selected.getRows(), 2*selected.getRows(), selected.fromRow+from+1, selected.fromRow+i, selected.fromColumn+selected.getRows(), selected.fromColumn+2*selected.getRows()));
+ this->thread_distribution[threadID].push_back(value_type(i, i + 1, selected.getRows(), j, selected.fromRow+i, selected.fromRow+i+1, selected.fromColumn+selected.getRows(), selected.fromColumn+j));
+ }
+ }
+ it = 0;
+ from = i;
+ fromCol = j;
+ ++threadID;
+ }
+
+ }
+ }
+ }
+
+ // assertion
+
+
+ //std::cerr << "DEBUG" << std::endl;
+ //for(U32 i = 0; i < this->thread_distribution.size(); ++i)
+ // std::cerr << i << '\t' << this->thread_distribution[i].size() << std::endl;
+ //std::cerr << "Has: " << this->thread_distribution.size() << " thread blocks" << std::endl;
+
+ return true;
+}
+
+bool LoadBalancerLD::setSelected(const S32 selected){
+ if(selected < 0){
+ std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Cannot set select a negative chunk..." << std::endl;
+ return false;
+ }
+
+ this->selected_chunk = selected;
+ return true;
+}
+
+bool LoadBalancerLD::setDesired(const S32 desired){
+ if(desired < 0){
+ std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Cannot cut workload into a negative number of blocks..." << std::endl;
+ return false;
+ }
+
+ this->desired_chunks = desired;
+ return true;
+}
+
+bool LoadBalancerLD::Build(const U32 total_blocks, const U32 threads){
+ if(this->selected_chunk > this->desired_chunks){
+ std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Incorrectly selected block (" << this->selected_chunk << '/' << this->desired_chunks << ")..." << std::endl;
+ return false;
+ }
+
+ // If selecting > 1 chunk
+ if(this->desired_chunks != 1){
+ U32 cutSize = 1;
+ //std::vector backup_cuts;
+ for(U32 i = 1; i < total_blocks; ++i){
+
+ if((i*i - i) / 2 == this->desired_chunks)
+ cutSize = i;
+
+ }
+
+ if(cutSize == 1){
+ std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Cannot cut into " << this->desired_chunks << " chunks" << std::endl;
+ return(false);
+ }
+
+ U32 total = 0; // Sanity
+ //std::cerr << "cut-size is: " << cutSize << std::endl;
+ const U32 rowLength = total_blocks / cutSize;
+ for(U32 i = 0; i < cutSize-1; ++i){
+ //std::cerr << i << '/' << cutSize-1 << std::endl;
+ U32 j = i;
+ U32 fromX = i*rowLength;
+ U32 toX = (i+1)*rowLength;
+ if(i + 1 == cutSize - 1)
+ toX = total_blocks;
+
+ for(; j < cutSize-1; ++j){
+ U32 fromY = j*rowLength;
+ U32 toY = (j+1)*rowLength;
+
+
+ if(j + 1 == cutSize - 1)
+ toY = total_blocks;
+
+ //std::cerr << "(" << i << ',' << j << ")\t" << fromX << '-' << toX << '\t' << fromY << '-' << toY << std::endl;
+ this->blocks.push_back(value_type(fromX, toX, fromY, toY));
+ ++total;
+ }
+ }
+
+ //std::cerr << "Total: " << total << '/' << this->desired_chunks << std::endl;
+ if(total != this->desired_chunks){
+ std::cerr << Helpers::timestamp("ERROR", "BALANCER") << "Corrupted balancing..." << std::endl;
+ return(false);
+ }
+
+ } else {
+ // All blocks
+ this->blocks.push_back(value_type(0, total_blocks, 0, total_blocks));
+ }
+
+ // What data do we load?
+ this->getSelectedLoad();
+
+ // Divide data into threads
+ if(!this->getSelectedLoadThreads(threads))
+ return false;
+
+ return true;
+}
+
+}
diff --git a/src/algorithm/load_balancer_ld.h b/src/algorithm/load_balancer_ld.h
new file mode 100644
index 0000000..b316caf
--- /dev/null
+++ b/src/algorithm/load_balancer_ld.h
@@ -0,0 +1,42 @@
+#ifndef ALGORITHM_LOAD_BALANCER_LD_H_
+#define ALGORITHM_LOAD_BALANCER_LD_H_
+
+#include "../support/MagicConstants.h"
+#include "../support/helpers.h"
+#include "load_balancer_block.h"
+
+namespace Tomahawk{
+
+class LoadBalancerLD{
+private:
+ typedef LoadBalancerLD self_type;
+ typedef LoadBalancerBlock value_type;
+ typedef value_type& reference;
+ typedef const value_type& const_reference;
+ typedef value_type* pointer;
+ typedef const value_type* const_pointer;
+ typedef std::ptrdiff_t difference_type;
+ typedef std::size_t size_type;
+
+public:
+ LoadBalancerLD();
+ ~LoadBalancerLD();
+
+ bool getSelectedLoad();
+ bool getSelectedLoadThreads(const U32 threads);
+ bool setSelected(const S32 selected);
+ bool setDesired(const S32 desired);
+ bool Build(const U32 total_blocks, const U32 threads);
+ inline std::vector< std::pair<U32, U32> >& getLoad(void){ return(this->data_to_load); }
+
+public:
+ U32 selected_chunk;
+ U32 desired_chunks;
+
+ std::vector<value_type> blocks;
+ std::vector< std::pair<U32, U32> > data_to_load;
+ std::vector< std::vector<value_type> > thread_distribution;
+};
+
+}
+#endif /* ALGORITHM_LOAD_BALANCER_LD_H_ */
diff --git a/src/algorithm/OpenHashTable.h b/src/algorithm/open_hashtable.h
similarity index 97%
rename from src/algorithm/OpenHashTable.h
rename to src/algorithm/open_hashtable.h
index 4cbd86a..ec337cd 100755
--- a/src/algorithm/OpenHashTable.h
+++ b/src/algorithm/open_hashtable.h
@@ -7,7 +7,7 @@
#include
#include
-#include "../support/TypeDefinitions.h"
+#include "../support/type_definitions.h"
#include "../third_party/xxhash/xxhash.h"
namespace Tomahawk {
@@ -38,8 +38,8 @@ class HashTable{
bool GetItem(const T* key, K*& entry, U32 length = sizeof(T));
bool GetItem(const void* key_address, const T* key, K*& entry, U32 length = sizeof(T));
void clear();
- U32 size(void) const{return this->__size;}
- U32 occupied(void) const{return this->__occupied;}
+ inline const U32& size(void) const{return this->__size;}
+ inline const U32& occupied(void) const{return this->__occupied;}
Entry& operator[](const U32 position){return *this->__entries[position];}
K& at(const U32 position){return this->__entries.at(position);}
diff --git a/src/algorithm/sort/TomahawkOutputSort.cpp b/src/algorithm/sort/TomahawkOutputSort.cpp
deleted file mode 100644
index 377950e..0000000
--- a/src/algorithm/sort/TomahawkOutputSort.cpp
+++ /dev/null
@@ -1,385 +0,0 @@
-#include
-
-#include "TomahawkOutputSort.h"
-
-namespace Tomahawk{
-namespace Algorithm{
-namespace Output{
-
-bool TomahawkOutputSorter::sort(const std::string& input, const std::string& destinationPrefix, U64 memory_limit){
- if(!this->reader.Open(input)){
- std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << input << "..." << std::endl;
- return false;
- }
-
- //
- std::vector paths = Helpers::filePathBaseExtension(destinationPrefix);
- this->basePath = paths[0];
- if(this->basePath.size() > 0)
- this->basePath += '/';
-
- if(paths[3].size() == Tomahawk::Constants::OUTPUT_LD_SUFFIX.size() &&
- strncasecmp(&paths[3][0], &Tomahawk::Constants::OUTPUT_LD_SUFFIX[0], Tomahawk::Constants::OUTPUT_LD_SUFFIX.size()) == 0)
- this-> baseName = paths[2];
- else this->baseName = paths[1];
-
- // Writing
- this->reader.setWriterType(0);
- this->reader.addLiteral("\n##tomahawk_partialSortCommand=" + Helpers::program_string());
- this->reader.OpenWriter(basePath + baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX);
-
- basic_writer_type toi_writer;
- toi_writer.open(basePath + baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX + '.' + Tomahawk::Constants::OUTPUT_LD_SORT_INDEX_SUFFIX);
- toi_header_type headIndex(Tomahawk::Constants::WRITE_HEADER_LD_SORT_MAGIC, this->reader.header.samples, this->reader.header.n_contig);
- headIndex.controller.sorted = 0;
- headIndex.controller.expanded = this->reverse_entries ? 1 : 0;
- headIndex.controller.partial_sort = 1;
- toi_writer.getNativeStream() << headIndex;
- // writer
- basic_writer_type& stream = *reinterpret_cast(this->reader.writer->getStream());
-
- if(memory_limit < 10e6){
- memory_limit = 10e6;
- std::cerr << Helpers::timestamp("SORT") << "Setting memory limit to 10 MB..." << std::endl;
- }
-
- if(this->reverse_entries){
- std::cerr << Helpers::timestamp("SORT") << "Reversing entries..." << std::endl;
- memory_limit /= 2;
- } else
- std::cerr << Helpers::timestamp("SORT") << "Not reversing..." << std::endl;
-
- // Perform indexed sorting if possible
- if(this->reader.hasIndex){
- return(this->__sortIndexed(toi_writer, input, memory_limit));
- }
-
- std::cerr << Helpers::timestamp("SORT") << "No index found..." << std::endl;
-
- bool trigger_break = false;
- U32 totempole_blocks_written = 0;
- while(true){
- if(!SILENT)
- std::cerr << Helpers::timestamp("LOG","SORT") << "Reading..." << std::endl;
-
- if(!this->reader.nextBlockUntil(memory_limit))
- trigger_break = true;
-
- if(this->reader.output_buffer.size() == 0){
- trigger_break = true;
- break;
- }
-
- assert((this->reader.output_buffer.size() % sizeof(entry_type)) == 0);
-
- totempole_entry totempole;
- if(this->reverse_entries)
- totempole.entries = 2*(this->reader.output_buffer.size() / sizeof(entry_type));
- else
- totempole.entries = this->reader.output_buffer.size() / sizeof(entry_type);
-
- if(this->reverse_entries){
- const entry_type* entry = nullptr;
- while(this->reader.nextVariantLimited(entry)){
- entry_type temp(entry);
- temp.swapDirection();
- this->reader.output_buffer.Add((char*)&temp, sizeof(entry_type));
- }
- }
-
- if(!SILENT)
- std::cerr << Helpers::timestamp("LOG","SORT") << "Sorting: " << Helpers::ToPrettyString(this->reader.output_buffer.size()/sizeof(entry_sort_type)) << " entries" << std::endl;
-
- std::sort(reinterpret_cast(&this->reader.output_buffer.data[0]),
- reinterpret_cast(&this->reader.output_buffer.data[this->reader.output_buffer.size()]));
-
- if(!SILENT)
- std::cerr << Helpers::timestamp("LOG","SORT") << "Indexing..." << std::endl;
-
- totempole.byte_offset = stream.getNativeStream().tellp();
- totempole.uncompressed_size = this->reader.output_buffer.size();
-
- this->reader.writer->write(this->reader.output_buffer);
- totempole.byte_offset_end = stream.getNativeStream().tellp();
- toi_writer.getNativeStream() << totempole;
-
- if(!SILENT)
- std::cerr << Helpers::timestamp("LOG","SORT") << "Writing..." << std::endl;
-
- ++totempole_blocks_written;
-
- if(trigger_break) break;
- }
-
- // Make sure TOI is flushed before re-opening and seeking
- toi_writer.flush();
-
- // TOI
- // Update blocks written
- std::fstream re(basePath + baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX + '.' + Tomahawk::Constants::OUTPUT_LD_SORT_INDEX_SUFFIX, std::ios::in | std::ios::out | std::ios::binary);
- if(!re.good()){
- std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to reopen index..." << std::endl;
- return false;
- }
-
- re.seekg(Tomahawk::Constants::WRITE_HEADER_LD_SORT_MAGIC_LENGTH + sizeof(float) + sizeof(U64) + sizeof(U32));
- if(!re.good()){
- std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to seek in index..." << std::endl;
- return false;
- }
-
- re.write((char*)&totempole_blocks_written, sizeof(U32));
- if(!re.good()){
- std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to update counts in index..." << std::endl;
- return false;
- }
- re.close();
-
- toi_writer.close();
-
- this->reader.writer->flush();
- this->reader.writer->close();
-
- return true;
-}
-
-bool TomahawkOutputSorter::__sortIndexed(basic_writer_type& toi_writer, const std::string& input, U64 memory_limit){
- std::cerr << Helpers::timestamp("SORT") << "Index found..." << std::endl;
-
- std::vector< totempole_entry > blocks;
-
- totempole_entry totempole;
- totempole.byte_offset = this->reader.toi_reader[0].byte_offset;
-
- U64 n_entries = 0;
- for(U32 i = totempole.uncompressed_size; i < this->reader.toi_reader.size(); ++i){
- if(totempole.uncompressed_size > memory_limit){
- totempole.byte_offset_end = this->reader.toi_reader[i].byte_offset;
- blocks.push_back(totempole);
- totempole.byte_offset = this->reader.toi_reader[i].byte_offset;
- totempole.entries = 0;
- totempole.uncompressed_size = 0;
- }
- totempole.entries += this->reader.toi_reader[i].entries;
- totempole.uncompressed_size += this->reader.toi_reader[i].uncompressed_size;
- n_entries += totempole.entries;
- }
-
- // Have to add final
- if(totempole.byte_offset != blocks.back().byte_offset){
- totempole.byte_offset_end = this->reader.toi_reader[this->reader.toi_reader.size() - 1].byte_offset_end;
- blocks.push_back(totempole);
- }
-
- if(totempole.entries != 0)
- blocks.push_back(totempole);
-
- // Todo: if n_threads > blocks.size()
- // set n_threads to blocks.size() and give each thread 1 block
-
- // Split workload into different threads
- // Each thread gets approximately 1/n_threads of the work
- std::vector< totempole_entry > thread_workload(this->n_threads);
- const U64 limit_thread = (U64)((double)blocks.size()/this->n_threads);
- U32 current_thread_target = 0;
- U64 n_blocks_loaded = 0;
- totempole.reset();
- totempole.byte_offset = blocks[0].byte_offset;
-
- for(U32 i = 0; i < blocks.size(); ++i){
- if(n_blocks_loaded >= limit_thread && current_thread_target + 1 != this->n_threads){
- n_blocks_loaded = 0;
- totempole.byte_offset_end = blocks[i].byte_offset;
- thread_workload[current_thread_target] = totempole;
- totempole.byte_offset = blocks[i].byte_offset;
- ++current_thread_target;
- }
- ++n_blocks_loaded;
- }
-
- // Add final
- if(current_thread_target != this->n_threads){
- totempole.byte_offset_end = blocks.back().byte_offset_end;
- thread_workload[this->n_threads-1] = totempole;
- }
-
- std::cerr << Helpers::timestamp("SORT") << "Spawning " << this->n_threads << " threads..." << std::endl;
- std::thread** slaves = new std::thread*[this->n_threads];
- slave_sorter** instances = new slave_sorter*[this->n_threads];
- for(U32 i = 0; i < this->n_threads; ++i){
- instances[i] = new slave_sorter(this->reader.writer, toi_writer, memory_limit);
- if(!instances[i]->open(input)){
- std::cerr << Helpers::timestamp("ERROR", "SORT") << "Failed to reopen file..." << std::endl;
- exit(1);
- }
-
- // Trigger reverse if applicable
- instances[i]->reverseEntries(this->reverse_entries);
- }
-
- for(U32 i = 0; i < this->n_threads; ++i)
- slaves[i] = instances[i]->start(thread_workload[i]);
-
- for(U32 i = 0; i < this->n_threads; ++i)
- slaves[i]->join();
-
- U32 totempole_blocks_written = 0;
- for(U32 i = 0; i < this->n_threads; ++i)
- totempole_blocks_written += instances[i]->getBlocksWritten();
-
- // TOI
- // Update blocks written
- // Make sure TOI is flushed before re-opening and seeking
- toi_writer.flush();
-
- std::fstream re(basePath + baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX + '.' + Tomahawk::Constants::OUTPUT_LD_SORT_INDEX_SUFFIX, std::ios::in | std::ios::out | std::ios::binary);
- if(!re.good()){
- std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to reopen index..." << std::endl;
- return false;
- }
-
- re.seekg(Tomahawk::Constants::WRITE_HEADER_LD_SORT_MAGIC_LENGTH + sizeof(float) + sizeof(U64) + sizeof(U32));
- if(!re.good()){
- std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to seek in index..." << std::endl;
- return false;
- }
-
- re.write((char*)&totempole_blocks_written, sizeof(U32));
- if(!re.good()){
- std::cerr << Helpers::timestamp("ERROR", "TWO") << "Failed to update counts in index..." << std::endl;
- return false;
- }
- re.close();
-
- toi_writer.close();
-
- this->reader.writer->flush();
- this->reader.writer->close();
-
- // Cleanup
- for(U32 i = 0; i < this->n_threads; ++i){
- delete instances[i];
- slaves[i] = nullptr;
- }
- delete [] instances;
- delete [] slaves;
-
- return true;
-}
-
-bool TomahawkOutputSorter::sortMerge(const std::string& inputFile, const std::string& destinationPrefix, const U32 block_size){
- if(!this->reader.Open(inputFile)){
- std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << inputFile << "..." << std::endl;
- return false;
- }
-
- if(!this->reader.hasIndex){
- std::cerr << Helpers::timestamp("ERROR","SORT") << "File is not indexed!" << std::endl;
- return false;
- }
-
- this->reader.addLiteral("\n##tomahawk_mergeSortCommand=" + Helpers::program_string());
-
- toi_header_type toi_header = this->reader.toi_reader.getHeader();
- if(toi_header.controller.sorted == true){
- std::cerr << Helpers::timestamp("ERROR","SORT") << "File is already sorted!" << std::endl;
- return false;
- }
-
- if(toi_header.controller.partial_sort == false){
- std::cerr << Helpers::timestamp("ERROR","SORT") << "File is not partially sorted!" << std::endl;
- return false;
- }
-
- toi_header.controller.partial_sort = false;
- toi_header.controller.sorted = true;
- writer_type writer(this->reader.contigs, &this->reader.header, toi_header);
-
- if(!writer.open(destinationPrefix)){
- std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open!" << std::endl;
- return false;
- }
- writer.writeHeader(this->reader.literals);
-
- const U32 n_toi_entries = this->reader.toi_reader.size();
- std::ifstream* streams = new std::ifstream[n_toi_entries];
- tgzf_iterator** iterators = new tgzf_iterator*[n_toi_entries];
-
- if(!SILENT)
- std::cerr << Helpers::timestamp("LOG", "SORT") << "Opening " << n_toi_entries << " file handles...";
-
- for(U32 i = 0; i < n_toi_entries; ++i){
- streams[i].open(inputFile);
- streams[i].seekg(this->reader.toi_reader[i].byte_offset);
- iterators[i] = new tgzf_iterator(streams[i], 65536, this->reader.toi_reader[i].byte_offset, this->reader.toi_reader[i].byte_offset_end);
- }
-
- if(!SILENT)
- std::cerr << " Done!" << std::endl;
-
- // queue
- queue_type outQueue;
-
- //
- if(!SILENT)
- std::cerr << Helpers::timestamp("LOG", "SORT") << "Merging..." << std::endl;
-
- // draw one from each
- const entry_type* e = nullptr;
- for(U32 i = 0; i < n_toi_entries; ++i){
- if(!iterators[i]->nextEntry(e)){
- std::cerr << Helpers::timestamp("ERROR", "SORT") << "Failed to get an entry..." << std::endl;
- return false;
- }
- outQueue.push( queue_entry(e, i, IO::Support::TomahawkOutputEntryCompFuncConst) );
- }
-
- if(outQueue.empty()){
- std::cerr << Helpers::timestamp("ERROR","SORT") << "No data in queue..." << std::endl;
- return false;
- }
-
- // while queue is not empty
- while(outQueue.empty() == false){
- // peek at top entry in queue
- const U32 id = outQueue.top().streamID;
- writer << outQueue.top().data;
-
- // remove this record from the queue
- outQueue.pop();
-
-
- while(iterators[id]->nextEntry(e)){
- if(!(*e < outQueue.top().data)){
- outQueue.push( queue_entry(e, id, IO::Support::TomahawkOutputEntryCompFuncConst) );
- break;
- }
- writer << *e;
- }
- }
-
- writer.flush();
- if(!writer.finalize(toi_header.controller.expanded)){
- std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to finalize index..." << std::endl;
- return false;
- }
-
- writer.close();
-
- // Temp
- //index_type& index = writer.getIndex();
- //std::cerr << index << std::endl;
-
- // Cleanup
- for(U32 i = 0; i < n_toi_entries; ++i)
- delete iterators[i];
-
- delete [] iterators;
- delete [] streams;
-
- return true;
-}
-
-}
-}
-}
diff --git a/src/algorithm/sort/TomahawkOutputSort.h b/src/algorithm/sort/TomahawkOutputSort.h
deleted file mode 100644
index 52758ad..0000000
--- a/src/algorithm/sort/TomahawkOutputSort.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef TOMAHAWKOUTPUTSORT_H_
-#define TOMAHAWKOUTPUTSORT_H_
-
-#include <queue>
-#include <thread>
-
-#include "../../io/compression/TGZFEntryIterator.h"
-#include "../../totempole/TotempoleOutputEntry.h"
-#include "../../tomahawk/TomahawkOutput/TomahawkOutputReader.h"
-#include "../../tomahawk/TomahawkOutput/TomahawkOutputManager.h"
-#include "TomahawkOutputSortMergeQueueContainer.h"
-#include "TomahawkOutputSortSlave.h"
-
-namespace Tomahawk{
-namespace Algorithm{
-namespace Output{
-
-// Sorter
-class TomahawkOutputSorter{
- typedef IO::TomahawkOutputEntry entry_type;
- typedef IO::TomahawkOutputEntrySort entry_sort_type;
- typedef TomahawkOutputSorter self_type;
- typedef TomahawkOutputSortMergeQueueContainer queue_entry;
- typedef std::priority_queue< queue_entry > queue_type; // prio queue
- typedef IO::TomahawkOutputReader two_reader_type;
- typedef Totempole::TotempoleOutputEntry totempole_entry;
- typedef IO::TomahawkOutputWriterIndex writer_type;
- typedef IO::WriterFile basic_writer_type;
- typedef TomahawkOutputSortSlave slave_sorter;
- typedef IO::TGZFEntryIterator tgzf_iterator;
- typedef Totempole::TotempoleOutputSortedIndex index_type;
- typedef Tomahawk::IO::TomahawkOutputSortHeader toi_header_type;
-
-public:
- TomahawkOutputSorter() : n_threads(std::thread::hardware_concurrency()), reverse_entries(true){}
- ~TomahawkOutputSorter(){}
-
- bool sort(const std::string& input, const std::string& destinationPrefix, U64 memory_limit);
- bool sortMerge(const std::string& input, const std::string& destinationPrefix, const U32 block_size);
-
-private:
- bool __sortUnindexed();
- bool __sortIndexed(basic_writer_type& toi_writer, const std::string& input, U64 memory_limit);
-
-private:
- two_reader_type reader;
-
-public:
- U32 n_threads;
- bool reverse_entries;
- std::string baseName;
- std::string basePath;
-};
-
-
-}
-}
-}
-
-#endif /* TOMAHAWKOUTPUTSORT_H_ */
diff --git a/src/algorithm/sort/TomahawkOutputSortSlave.h b/src/algorithm/sort/TomahawkOutputSortSlave.h
deleted file mode 100644
index 9d46b6e..0000000
--- a/src/algorithm/sort/TomahawkOutputSortSlave.h
+++ /dev/null
@@ -1,128 +0,0 @@
-#ifndef TOMAHAWKOUTPUTSORTSLAVE_H_
-#define TOMAHAWKOUTPUTSORTSLAVE_H_
-
-#include <thread>
-
-namespace Tomahawk{
-namespace Algorithm{
-namespace Output{
-
-class TomahawkOutputSortSlave{
- typedef TomahawkOutputSortSlave self_type;
- typedef IO::TomahawkOutputEntry entry_type;
- typedef IO::TomahawkOutputEntrySort entry_sort_type;
- typedef Totempole::TotempoleOutputEntry totempole_entry;
- typedef IO::WriterFile writer_type;
- typedef IO::TomahawkOutputReader two_reader_type;
- typedef Tomahawk::IO::TomahawkOutputWriterInterface two_writer_interface;
- typedef Tomahawk::IO::TomahawkOutputWriter two_writer_type;
- typedef IO::TGZFController tgzf_controller_type;
-
-public:
- TomahawkOutputSortSlave(two_writer_interface* writer, writer_type& toi_writer, const U32 memory_limit) :
- memory_limit(memory_limit),
- blocks_written(0),
- writer(reinterpret_cast<two_writer_type*>(writer)),
- toi_writer(toi_writer),
- reverse_entries(true)
- {}
- ~TomahawkOutputSortSlave(){}
-
- inline void reverseEntries(const bool yes = true){ this->reverse_entries = yes; }
-
- bool open(const std::string& input){
- if(!this->reader.Open(input)){
- std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << input << "..." << std::endl;
- return false;
- }
-
- return true;
- }
-
- std::thread* start(const totempole_entry& workload){
- this->thread = std::thread(&self_type::sort, this, workload);
- return(&this->thread);
- }
-
- inline const U32& getBlocksWritten(void) const{ return(this->blocks_written); }
-
-private:
- bool sort(const totempole_entry& workload){
- bool trigger_break = false;
- writer_type& stream = *reinterpret_cast<writer_type*>(this->writer->getStream());
- this->reader.stream.seekg(workload.byte_offset);
- if(!this->reader.stream.good()){
- std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to seek in file..." << std::endl;
- return false;
- }
-
- totempole_entry totempole;
- while(true){
- if(this->reader.stream.tellg() == workload.byte_offset_end)
- break;
-
- if(!this->reader.nextBlockUntil(this->memory_limit, workload.byte_offset_end))
- trigger_break = true;
-
-
- if(this->reader.output_buffer.size() == 0){
- trigger_break = true;
- break;
- }
-
- assert((this->reader.output_buffer.size() % sizeof(entry_type)) == 0);
-
- if(this->reverse_entries){
- const entry_type* entry = nullptr;
- totempole.entries += 2*(this->reader.output_buffer.size() / sizeof(entry_type));
- while(this->reader.nextVariantLimited(entry)){
- // Flip cA,pA with cB,pB
- entry_type temp(entry);
- temp.swapDirection();
- this->reader.output_buffer.Add((char*)&temp, sizeof(entry_type));
- }
- } else {
- // Do not reverse
- totempole.entries = (this->reader.output_buffer.size() / sizeof(entry_type));
- }
-
- std::sort(reinterpret_cast<entry_sort_type*>(&this->reader.output_buffer.data[0]),
- reinterpret_cast<entry_sort_type*>(&this->reader.output_buffer.data[this->reader.output_buffer.size()]));
-
- totempole.reset();
- totempole.uncompressed_size = this->reader.output_buffer.size();
-
- this->controller.Clear();
- this->controller.Deflate(this->reader.output_buffer);
-
- this->writer->getLock()->lock();
- totempole.byte_offset = stream.getNativeStream().tellp();
- this->writer->getStream()->writeNoLock(this->controller.buffer.data, this->controller.buffer.pointer);
- ++this->blocks_written;
- totempole.byte_offset_end = stream.getNativeStream().tellp();
- toi_writer.getNativeStream() << totempole;
- this->writer->getLock()->unlock();
-
- if(trigger_break) break;
- }
-
- return true;
- }
-
-private:
- const U32 memory_limit;
- U32 blocks_written;
- two_writer_type* writer;
- writer_type& toi_writer;
- two_reader_type reader;
- std::thread thread;
- tgzf_controller_type controller;
- bool reverse_entries;
-};
-
-
-}
-}
-}
-
-#endif /* TOMAHAWKOUTPUTSORTSLAVE_H_ */
diff --git a/src/algorithm/sort/TomahawkOutputSortMergeQueueContainer.h b/src/algorithm/sort/output_sort_merge_queue.h
similarity index 63%
rename from src/algorithm/sort/TomahawkOutputSortMergeQueueContainer.h
rename to src/algorithm/sort/output_sort_merge_queue.h
index bfc2476..6c5cfed 100644
--- a/src/algorithm/sort/TomahawkOutputSortMergeQueueContainer.h
+++ b/src/algorithm/sort/output_sort_merge_queue.h
@@ -3,17 +3,16 @@
namespace Tomahawk{
namespace Algorithm{
-namespace Output{
-
template <class T>
-struct TomahawkOutputSortMergeQueueContainer {
+struct OutputSortMergeQueue {
+private:
typedef T entry_type;
- typedef TomahawkOutputSortMergeQueueContainer self_type;
+ typedef OutputSortMergeQueue self_type;
public:
- TomahawkOutputSortMergeQueueContainer(const entry_type* data,
- U32 streamID,
- bool (*compFunc)(const entry_type& a, const entry_type& b) = T::operator<)
+ OutputSortMergeQueue(const entry_type* data,
+ U32 streamID,
+ bool (*compFunc)(const entry_type& a, const entry_type& b) = T::operator<)
: streamID(streamID)
, data(*data)
, compFunc(compFunc)
@@ -29,7 +28,6 @@ struct TomahawkOutputSortMergeQueueContainer {
bool (*compFunc)(const entry_type& a, const entry_type& b);
};
-}
}
}
diff --git a/src/algorithm/sort/output_sort_slave.h b/src/algorithm/sort/output_sort_slave.h
new file mode 100644
index 0000000..be775b3
--- /dev/null
+++ b/src/algorithm/sort/output_sort_slave.h
@@ -0,0 +1,120 @@
+#ifndef TOMAHAWKOUTPUTSORTSLAVE_H_
+#define TOMAHAWKOUTPUTSORTSLAVE_H_
+
+#include <thread>
+
+#include "../../io/output_writer.h"
+
+namespace Tomahawk{
+namespace Algorithm{
+
+/**<
+ * Worker slave for partially sorting a (large) `two` file
+ */
+class OutputSortSlave {
+private:
+ typedef OutputSortSlave self_type;
+ typedef IO::OutputEntry entry_type;
+ typedef IO::OutputWriter writer_type;
+ typedef TomahawkOutputReader reader_type;
+ typedef IO::TGZFController tgzf_controller_type;
+ typedef IO::BasicBuffer buffer_type;
+ typedef IO::TGZFEntryIterator<entry_type> tgzf_iterator;
+
+public:
+ OutputSortSlave(reader_type& reader, writer_type& writer, const std::pair<U64, U64>& workload, const U32 memory_limit) :
+ workload_(workload),
+ n_memory_limit_(memory_limit),
+ reader_(reader),
+ writer_(writer)
+ {}
+
+ ~OutputSortSlave(){
+ this->inflate_buffer_.deleteAll();
+ this->data_.deleteAll();
+ }
+
+ bool open(const std::string& input){
+ this->stream_.open(input, std::ios::binary | std::ios::in);
+ if(this->stream_.good() == false){
+ std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << input << "..." << std::endl;
+ return false;
+ }
+
+ return true;
+ }
+
+ std::thread* start(void){
+ this->thread_ = std::thread(&self_type::sort, this);
+ return(&this->thread_);
+ }
+
+ inline const writer_type& getWriter(void) const{ return(this->writer_); }
+
+private:
+ bool sort(void){
+ if(!this->stream_.seekg(this->workload_.first)){
+ std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to seek to block!" << std::endl;
+ exit(1); // exit instead of return because of detached threads
+ }
+
+ // iterator
+ const U64 n_entries_limit = this->n_memory_limit_ / sizeof(entry_type);
+ tgzf_iterator it(this->stream_, 65536, this->workload_.first, this->workload_.second);
+ bool finished_ = false;
+
+ if(!this->stream_.good()){
+ std::cerr << Helpers::timestamp("ERROR","SORT") << "Stream is bad!" << std::endl;
+ exit(1); // exit instead of return because of detached threads
+ }
+
+ while(true){
+ OutputContainer container(this->n_memory_limit_ / sizeof(entry_type) + 1024);
+
+ const entry_type* e = nullptr;
+ for(U32 i = 0; i < n_entries_limit; ++i){
+ if(!it.nextEntry(e)){
+ finished_ = true;
+ break;
+ }
+ container += *e;
+ }
+
+ std::sort(&container.front(), &container.back());
+
+ const entry_type* prev = &container[0];
+ for(size_t j = 1; j < container.size(); ++j){
+ if(*prev >= container[j]){
+ std::cerr << j-1 << ',' << j << std::endl;
+ std::cerr << *prev << std::endl;
+ std::cerr << container[j] << std::endl;
+ exit(1);
+ }
+ prev = &container[j];
+ }
+
+ this->writer_ << container;
+ if(finished_)
+ break;
+ }
+
+ return true;
+ }
+
+private:
+ std::ifstream stream_;
+ std::pair<U64, U64> workload_;
+ const U32 n_memory_limit_;
+ const reader_type& reader_;
+ writer_type writer_;
+ std::thread thread_;
+ buffer_type inflate_buffer_;
+ buffer_type data_;
+ tgzf_controller_type compression_manager_;
+};
+
+
+}
+}
+
+#endif /* TOMAHAWKOUTPUTSORTSLAVE_H_ */
diff --git a/src/algorithm/sort/output_sorter.cpp b/src/algorithm/sort/output_sorter.cpp
new file mode 100644
index 0000000..c9f28de
--- /dev/null
+++ b/src/algorithm/sort/output_sorter.cpp
@@ -0,0 +1,206 @@
+#include
+
+#include "output_sorter.h"
+
+namespace Tomahawk{
+namespace Algorithm{
+
+// Algorithmic sketch:
+// 1: Load data into balanced chunks of memory_limit bytes
+// 2: Load partitioned data into containers
+// 3: Perform sort
+// 4: Perform merge if desired
+bool OutputSorter::sort(const std::string& input, const std::string& destinationPrefix, U64 memory_limit){
+ if(!this->reader.open(input)){
+ std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << input << "..." << std::endl;
+ return false;
+ }
+
+ if(this->reader.getIndex().getController().isSorted){
+ std::cerr << Helpers::timestamp("LOG","SORT") << "File is already sorted..." << std::endl;
+ return true;
+ }
+
+ //std::cerr << this->reader.getIndex().totalBytes() << "->" << this->reader.getIndex().totalBytes() / this->n_threads << std::endl;
+ const U64 n_variants_chunk = this->reader.getIndex().totalBytes() / this->n_threads;
+
+ std::pair<U64, U64>* thread_distribution = new std::pair<U64, U64>[this->n_threads];
+ size_t i = 0;
+ U32 t = 0;
+ for(; t < this->n_threads; ++t){
+ U64 partition_size = 0;
+ const size_t from = i;
+ for(; i < this->reader.getIndex().size(); ++i){
+ partition_size += this->reader.getIndex().getContainer().at(i).sizeBytes();
+ if(partition_size >= n_variants_chunk && t + 1 != this->n_threads){
+ ++i;
+ break;
+ }
+ }
+ thread_distribution[t].first = this->reader.getIndex().getContainer().at(from).byte_offset;
+ thread_distribution[t].second = this->reader.getIndex().getContainer().at(i).byte_offset;
+
+ if(i == this->reader.getIndex().size()){
+ //std::cerr << "ran out of data" << std::endl;
+ thread_distribution[t].second = this->reader.getIndex().getContainer().back().byte_offset_end;
+ //std::cerr << "t: " << from << "->" << this->reader.getIndex().getContainer().size() << " (" << thread_distribution[t].first << "->" << thread_distribution[t].second << ") for " << partition_size << "/" << n_variants_chunk << std::endl;
+ ++t;
+ break;
+ }
+ //std::cerr << "t: " << from << "->" << i << " (" << thread_distribution[t].first << "->" << thread_distribution[t].second << ") for " << partition_size << "/" << n_variants_chunk << std::endl;
+ }
+ const U32 active_threads = t;
+
+ // Append executed command to literals
+ this->reader.getHeader().getLiterals() += "\n##tomahawk_sortCommand=" + Helpers::program_string();
+
+ IO::OutputWriter writer;
+ if(!writer.open(destinationPrefix)){
+ std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << destinationPrefix << "..." << std::endl;
+ return false;
+ }
+ writer.writeHeaders(this->reader.getHeader());
+
+ if(!SILENT)
+ std::cerr << Helpers::timestamp("LOG","SORT") << "Spawning: " << active_threads << " workers..." << std::endl;
+
+ OutputSortSlave** slaves = new OutputSortSlave*[active_threads];
+ std::thread** threads = new std::thread*[active_threads];
+
+ for(U32 i = 0; i < active_threads; ++i){
+ slaves[i] = new OutputSortSlave(this->reader, writer, thread_distribution[i], memory_limit);
+ if(!slaves[i]->open(input)){
+ std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << input << "..." << std::endl;
+ return false;
+ }
+ threads[i] = slaves[i]->start();
+ }
+
+ for(U32 i = 0; i < active_threads; ++i)
+ threads[i]->join();
+
+ for(U32 i = 0; i < active_threads; ++i)
+ writer += slaves[i]->getWriter();
+
+ writer.setSorted(false);
+ writer.setPartialSorted(true);
+ writer.flush();
+ writer.writeFinal();
+
+ if(!SILENT)
+ std::cerr << Helpers::timestamp("LOG") << "Output: " << Helpers::ToPrettyString(writer.sizeEntries()) << " entries into " << Helpers::ToPrettyString(writer.sizeBlocks()) << " blocks..." << std::endl;
+
+ for(U32 i = 0; i < active_threads; ++i)
+ delete slaves[i];
+
+ delete [] slaves;
+ delete [] threads;
+ delete [] thread_distribution;
+
+ return true;
+}
+
+bool OutputSorter::sortMerge(const std::string& inputFile, const std::string& destinationPrefix, const U32 block_size){
+ if(!this->reader.open(inputFile)){
+ std::cerr << Helpers::timestamp("ERROR","SORT") << "Failed to open: " << inputFile << "..." << std::endl;
+ return false;
+ }
+
+ if(this->reader.getIndex().getController().isSorted){
+ std::cerr << Helpers::timestamp("LOG","SORT") << "File is already sorted..." << std::endl;
+ return true;
+ }
+
+ if(this->reader.getIndex().getController().isPartialSorted == false){
+ std::cerr << Helpers::timestamp("ERROR","SORT") << "File is not partially sorted..." << std::endl;
+ return false;
+ }
+
+ // Append executed command to literals
+ this->reader.getHeader().getLiterals() += "\n##tomahawk_mergeSortCommand=" + Helpers::program_string();
+
+ IO::OutputWriter writer;
+ if(!writer.open(destinationPrefix)){
+ std::cerr << Helpers::timestamp("ERROR", "SORT") << "Failed to open: " << destinationPrefix << "..." << std::endl;
+ return false;
+ }
+ writer.setFlushLimit(block_size);
+ writer.writeHeaders(this->reader.getHeader());
+
+ const U32 n_toi_entries = this->reader.getIndex().size();
+ std::ifstream* streams = new std::ifstream[n_toi_entries];
+ tgzf_iterator** iterators = new tgzf_iterator*[n_toi_entries];
+
+ if(!SILENT)
+ std::cerr << Helpers::timestamp("LOG", "SORT") << "Opening " << n_toi_entries << " file handles...";
+
+ for(U32 i = 0; i < n_toi_entries; ++i){
+ streams[i].open(inputFile);
+ streams[i].seekg(this->reader.getIndex().getContainer()[i].byte_offset);
+ iterators[i] = new tgzf_iterator(streams[i], 65536, this->reader.getIndex().getContainer()[i].byte_offset, this->reader.getIndex().getContainer()[i].byte_offset_end);
+ }
+
+ if(!SILENT)
+ std::cerr << " Done!" << std::endl;
+
+ // queue
+ queue_type outQueue;
+
+ //
+ if(!SILENT)
+ std::cerr << Helpers::timestamp("LOG", "SORT") << "Merging..." << std::endl;
+
+ // draw one from each
+ const entry_type* e = nullptr;
+ for(U32 i = 0; i < n_toi_entries; ++i){
+ if(!iterators[i]->nextEntry(e)){
+ std::cerr << Helpers::timestamp("ERROR", "SORT") << "Failed to get an entry..." << std::endl;
+ return false;
+ }
+ outQueue.push( queue_entry(e, i, entry_type::sortAscending) );
+ }
+
+ if(outQueue.empty()){
+ std::cerr << Helpers::timestamp("ERROR","SORT") << "No data in queue..." << std::endl;
+ return false;
+ }
+
+ // while queue is not empty
+ while(outQueue.empty() == false){
+ // peek at top entry in queue
+ const U32 id = outQueue.top().streamID;
+ writer << outQueue.top().data;
+
+ // remove this record from the queue
+ outQueue.pop();
+
+
+ while(iterators[id]->nextEntry(e)){
+ if(!(*e < outQueue.top().data)){
+ outQueue.push( queue_entry(e, id, entry_type::sortAscending) );
+ break;
+ }
+ writer << *e;
+ }
+ }
+
+ writer.setPartialSorted(false);
+ writer.setSorted(true);
+ writer.flush();
+ writer.writeFinal();
+
+ if(!SILENT)
+ std::cerr << Helpers::timestamp("LOG") << "Output: " << Helpers::ToPrettyString(writer.sizeEntries()) << " entries into " << Helpers::ToPrettyString(writer.sizeBlocks()) << " blocks..." << std::endl;
+
+ // Cleanup
+ for(U32 i = 0; i < n_toi_entries; ++i)
+ delete iterators[i];
+
+ delete [] iterators;
+ delete [] streams;
+
+ return true;
+}
+
+}
+}
diff --git a/src/algorithm/sort/output_sorter.h b/src/algorithm/sort/output_sorter.h
new file mode 100644
index 0000000..bceb56d
--- /dev/null
+++ b/src/algorithm/sort/output_sorter.h
@@ -0,0 +1,61 @@
+#ifndef TOMAHAWKOUTPUTSORT_H_
+#define TOMAHAWKOUTPUTSORT_H_
+
+#include <queue>
+#include <thread>
+
+#include "../../io/compression/TGZFEntryIterator.h"
+#include "../../tomahawk/two/TomahawkOutputReader.h"
+#include "output_sort_merge_queue.h"
+#include "output_sort_slave.h"
+
+namespace Tomahawk{
+namespace Algorithm{
+
+/**<
+ * Primary class for sorting `TWO` data
+ */
+class OutputSorter{
+ typedef IO::OutputEntry entry_type;
+ typedef TomahawkOutputReader two_reader_type;
+ typedef IO::TGZFEntryIterator<entry_type> tgzf_iterator;
+ typedef OutputSorter self_type;
+ typedef OutputSortMergeQueue<entry_type> queue_entry;
+ typedef std::priority_queue<queue_entry> queue_type; // priority queue
+
+public:
+ OutputSorter() : n_threads(std::thread::hardware_concurrency()){}
+ ~OutputSorter(){}
+
+ /**<
+ * Standard sorting approach: each worker loads up to `memory_limit` bytes of
+ * entries, sorts them and writes them back out as a partially sorted block
+ * @param input             Path to the unsorted `two` input file
+ * @param destinationPrefix Output path prefix for the partially sorted file
+ * @param memory_limit      Approximate number of bytes of entries each worker loads per chunk
+ * @return                  Returns TRUE upon success or FALSE otherwise
+ */
+ bool sort(const std::string& input, const std::string& destinationPrefix, U64 memory_limit);
+
+ /**<
+ * N-way merge of the parallel-sorted blocks into a single, globally sorted file
+ * @param input             Path to the partially sorted `two` input file
+ * @param destinationPrefix Output path prefix for the sorted file
+ * @param block_size        Flush limit passed to the output writer
+ * @return                  Returns TRUE upon success or FALSE otherwise
+ */
+ bool sortMerge(const std::string& input, const std::string& destinationPrefix, const U32 block_size);
+
+ inline const size_t size(void) const{ return(this->n_threads); }
+
+private:
+ two_reader_type reader;
+
+public:
+ size_t n_threads;
+};
+
+
+}
+}
+
+#endif /* TOMAHAWKOUTPUTSORT_H_ */
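
Usage sketch (illustrative only, not part of the changeset): driving the new OutputSorter end to end. The file names, memory budget, block size and include path below are assumptions, and the suffix of the intermediate file produced by IO::OutputWriter::open() may differ.

    #include "algorithm/sort/output_sorter.h"

    int sort_two_file(void){
        Tomahawk::Algorithm::OutputSorter sorter;
        sorter.n_threads = 4; // defaults to std::thread::hardware_concurrency()

        // Step 1: partially sort ~256 MB chunks of entries in parallel
        if(!sorter.sort("data.two", "data_partial", 256000000ULL))
            return 1;

        // Step 2: N-way merge of the partially sorted blocks into a sorted file
        Tomahawk::Algorithm::OutputSorter merger;
        if(!merger.sortMerge("data_partial.two", "data_sorted", 500000))
            return 1;

        return 0;
    }
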
diff --git a/src/calc.h b/src/calc.h
index dc0ce6a..b78caa1 100644
--- a/src/calc.h
+++ b/src/calc.h
@@ -231,6 +231,9 @@ int calc(int argc, char** argv){
return 1;
}
+ //Tomahawk::Totempole::TotempoleReader totempole_reader;
+ //Tomahawk::TomahawkReader tomahawk_reader;
+
//std::vector rets = totempole.findOverlaps(Tomahawk::Interval(0, 2221297, 10108169));
//std::cerr << "Found overlaps: " << rets.size() << std::endl;
//for(U32 i = 0; i < rets.size(); ++i)
diff --git a/src/concat.h b/src/concat.h
index f8ed2b1..d1a02d3 100644
--- a/src/concat.h
+++ b/src/concat.h
@@ -22,8 +22,8 @@ DEALINGS IN THE SOFTWARE.
*/
#include
+#include "tomahawk/two/TomahawkOutputReader.h"
#include "utility.h"
-#include "tomahawk/TomahawkOutput/TomahawkOutputReader.h"
void concat_usage(void){
programMessage();
@@ -108,17 +108,21 @@ int concat(int argc, char** argv){
std::cerr << Tomahawk::Helpers::timestamp("LOG") << "Calling concat..." << std::endl;
}
- Tomahawk::IO::TomahawkOutputReader reader;
+ Tomahawk::TomahawkOutputReader reader;
if(input.size() == 0){
+ /*
if(!reader.concat(files, output)){
std::cerr << Tomahawk::Helpers::timestamp("ERROR", "CONCAT") << "Failed to concat files!" << std::endl;
return 1;
}
+ */
} else {
+ /*
if(!reader.concat(input, output)){
std::cerr << Tomahawk::Helpers::timestamp("ERROR", "CONCAT") << "Failed to concat files!" << std::endl;
return 1;
}
+ */
}
return 0;
diff --git a/src/import.h b/src/import.h
index a5c3160..8a9b7d5 100644
--- a/src/import.h
+++ b/src/import.h
@@ -151,9 +151,10 @@ int import(int argc, char** argv){
std::cerr << Tomahawk::Helpers::timestamp("LOG") << "Calling import..." << std::endl;
}
+
Tomahawk::TomahawkImporter importer(input, output);
- importer.getFilters().HWE_P = hwe_p;
- importer.getFilters().MAF = maf;
+ importer.getFilters().HWE_P = hwe_p;
+ importer.getFilters().MAF = maf;
importer.getFilters().missingness = missingness;
if(!extension_mode){
diff --git a/src/index.h b/src/index.h
deleted file mode 100644
index 9e708c2..0000000
--- a/src/index.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
-Copyright (C) 2016-2017 Genome Research Ltd.
-Author: Marcus D. R. Klarqvist
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-#include "utility.h"
-#include "tomahawk/TomahawkReader.h"
-#include "totempole/TotempoleReader.h"
-#include "tomahawk/TomahawkOutput/TomahawkOutputReader.h"
-
-int index(int argc, char** argv){
- argc -= 2; argv += 2;
- programMessage();
- std::cerr << Tomahawk::Helpers::timestamp("LOG") << "Calling index..." << std::endl;
-
- if(argc < 1){
- std::cerr << argc << std::endl;
- std::cerr << Tomahawk::Helpers::timestamp("ERROR") << "Missing parameters" << std::endl;
- return(1);
- }
-
- std::string inputFile(&argv[0][0]);
-
- // Parse file suffix
- std::vector<std::string> paths = Tomahawk::Helpers::splitLastOf(inputFile, '/', true);
- std::vector<std::string> files = Tomahawk::Helpers::splitLastOf(paths[1], '.');
-
- // Todo: if failed to read from file suffix: try to look into file header MAGIC
- if(files[1].size() == 0){
- std::cerr << "could not determine file type from suffix" << std::endl;
- return false;
- }
-
- std::transform(files[1].begin(), files[1].end(), files[1].begin(), ::tolower);
-
- if(files[1] == Tomahawk::Constants::OUTPUT_SUFFIX){
- std::cerr << Tomahawk::Helpers::timestamp("ERROR","INDEX") << "Twk files are already indexed..." << std::endl;
- } else if(files[1] == Tomahawk::Constants::OUTPUT_LD_SUFFIX) {
- Tomahawk::IO::TomahawkOutputReader reader;
-
- if(!reader.index(inputFile)){
- std::cerr << Tomahawk::Helpers::timestamp("ERROR", "INDEX") << "Failed to index file!" << std::endl;
- return 1;
- }
- } else {
- std::cerr << "Unknown file type" << std::endl;
- }
-
- return 0;
-}
diff --git a/src/index/footer.h b/src/index/footer.h
new file mode 100644
index 0000000..e11da60
--- /dev/null
+++ b/src/index/footer.h
@@ -0,0 +1,73 @@
+#ifndef TOTEMPOLEHEADER_H_
+#define TOTEMPOLEHEADER_H_
+
+#include
+
+#include "../support/MagicConstants.h"
+
+namespace Tomahawk {
+namespace Totempole {
+
+#define TWK_FOOTER_LENGTH (Constants::eof_length + sizeof(U32) + sizeof(U64))
+
+struct Footer{
+public:
+ typedef Footer self_type;
+
+public:
+ Footer() :
+ offset_end_of_data(0),
+ l_largest_uncompressed(0)
+ {
+ Helpers::HexToBytes(Constants::eof_hex, &this->EOF_marker[0]);
+ }
+
+ Footer(const char* const data) :
+ offset_end_of_data(*reinterpret_cast<const U64*>(data)),
+ l_largest_uncompressed(*reinterpret_cast<const U32*>(&data[sizeof(U64)]))
+ {
+ memcpy(&this->EOF_marker[0], &data[sizeof(U64)+sizeof(U32)], Constants::eof_length);
+ }
+
+ ~Footer() = default;
+
+ inline const U64& getEODPosition(void) const{ return(this->offset_end_of_data); }
+ inline const U32& getLargestUncompressedBlock(void) const{ return(this->l_largest_uncompressed); }
+ inline U64& getEODPosition(void){ return(this->offset_end_of_data); }
+ inline U32& getLargestUncompressedBlock(void){ return(this->l_largest_uncompressed); }
+
+ inline const bool validate(void) const{
+ if(this->offset_end_of_data == 0) return false;
+ if(this->l_largest_uncompressed == 0) return false;
+
+ BYTE reference[Constants::eof_length];
+ Helpers::HexToBytes(Constants::eof_hex, &reference[0]);
+
+ if(strncmp(reinterpret_cast<const char*>(&this->EOF_marker[0]), reinterpret_cast<const char*>(&reference[0]), Constants::eof_length) != 0) return false;
+ return true;
+ }
+
+ friend std::ostream& operator<<(std::ostream& stream, const self_type& footer){
+ stream.write(reinterpret_cast<const char*>(&footer.offset_end_of_data), sizeof(U64));
+ stream.write(reinterpret_cast<const char*>(&footer.l_largest_uncompressed), sizeof(U32));
+ stream.write(reinterpret_cast<const char*>(&footer.EOF_marker[0]), Constants::eof_length);
+ return(stream);
+ }
+
+ friend std::istream& operator>>(std::istream& stream, self_type& footer){
+ stream.read(reinterpret_cast<char*>(&footer.offset_end_of_data), sizeof(U64));
+ stream.read(reinterpret_cast<char*>(&footer.l_largest_uncompressed), sizeof(U32));
+ stream.read(reinterpret_cast<char*>(&footer.EOF_marker[0]), Constants::eof_length);
+ return(stream);
+ }
+
+public:
+ U64 offset_end_of_data; // byte offset to the end of the data section
+ U32 l_largest_uncompressed; // largest block-size in bytes
+ BYTE EOF_marker[Constants::eof_length];
+};
+
+}
+}
+
+#endif /* TOTEMPOLEHEADER_H_ */
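
A short round-trip sketch of the new Footer (assumed usage; the actual write/read sites live in the writer and reader classes, which are not shown in this hunk, and the values below are illustrative):

    #include <fstream>
    #include "index/footer.h"

    void write_and_check_footer(std::fstream& stream){
        Tomahawk::Totempole::Footer footer;
        footer.getEODPosition() = (U64)stream.tellp();   // data blocks end here
        footer.getLargestUncompressedBlock() = 65536;    // illustrative value
        stream << footer;                                // writes TWK_FOOTER_LENGTH bytes

        // Reading it back: the footer is the last TWK_FOOTER_LENGTH bytes of the file
        stream.seekg(-(std::streamoff)TWK_FOOTER_LENGTH, std::ios::end);
        Tomahawk::Totempole::Footer loaded;
        stream >> loaded;
        if(loaded.validate() == false){
            // missing or corrupt EOF marker: treat the file as truncated
        }
    }
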
diff --git a/src/index/index.cpp b/src/index/index.cpp
new file mode 100644
index 0000000..d88f79a
--- /dev/null
+++ b/src/index/index.cpp
@@ -0,0 +1,67 @@
+#include "index.h"
+
+namespace Tomahawk{
+
+Index::Index(){}
+Index::~Index(){}
+
+// Reading an index from a byte stream
+Index::Index(const char* const data, const U32 l_data) :
+ controller_(data),
+ meta_container_(&data[sizeof(BYTE)], *reinterpret_cast<const size_type*>(&data[sizeof(BYTE)])*TWK_INDEX_META_ENTRY_SIZE+sizeof(size_type)),
+ container_(&data[sizeof(BYTE) + this->meta_container_.size() * TWK_INDEX_META_ENTRY_SIZE + sizeof(size_type)], l_data - (this->meta_container_.size() * TWK_INDEX_META_ENTRY_SIZE + sizeof(size_type) + sizeof(BYTE)))
+{
+
+}
+
+bool Index::buildMetaIndex(const U32 n_contigs){
+ if(this->getContainer().size() == 0)
+ return false;
+
+ if(this->isSorted() == false)
+ return false;
+
+ meta_entry_type reference_entry;
+ reference_entry.index_begin = 0;
+ reference_entry.index_end = 1;
+ reference_entry.min_position = this->getContainer()[0].min_position;
+ reference_entry.max_position = this->getContainer()[0].max_position;
+ reference_entry.n_variants = this->getContainer()[0].n_variants;
+ reference_entry.uncompressed_size = this->getContainer()[0].uncompressed_size;
+ U32 reference_contig = this->getContainer()[0].contigID;
+
+ meta_entry_type* temp_entries = new meta_entry_type[n_contigs];
+
+ for(U32 i = 1; i < this->getContainer().size(); ++i){
+ if(this->getContainer()[i].contigID != reference_contig){
+ if(this->getContainer()[i].contigID < reference_contig)
+ continue;
+
+ temp_entries[reference_contig] = reference_entry;
+ reference_contig = this->getContainer()[i].contigID;
+ reference_entry.index_begin = i;
+ reference_entry.index_end = i + 1;
+ reference_entry.min_position = this->getContainer()[i].min_position;
+ reference_entry.max_position = this->getContainer()[i].max_position;
+ reference_entry.n_variants = this->getContainer()[i].n_variants;
+ reference_entry.uncompressed_size = this->getContainer()[i].uncompressed_size;
+
+ } else {
+ ++reference_entry.index_end;
+ reference_entry.max_position = this->getContainer()[i].max_position;
+ reference_entry.n_variants += this->getContainer()[i].n_variants;
+ reference_entry.uncompressed_size += this->getContainer()[i].uncompressed_size;
+ }
+ }
+ temp_entries[reference_contig] = reference_entry;
+
+ for(U32 i = 0; i < n_contigs; ++i){
+ //std::cerr << temp_entries[i] << std::endl;
+ this->meta_container_ += temp_entries[i];
+ }
+ delete [] temp_entries;
+
+ return(true);
+}
+
+}
diff --git a/src/index/index.h b/src/index/index.h
new file mode 100644
index 0000000..aa58d1c
--- /dev/null
+++ b/src/index/index.h
@@ -0,0 +1,128 @@
+#ifndef INDEX_INDEX_H_
+#define INDEX_INDEX_H_
+
+#include "index_contig.h"
+#include "index_container.h"
+#include "index_meta_container.h"
+#include "../io/BasicBuffer.h"
+#include "footer.h"
+
+namespace Tomahawk{
+
+/**<
+ * Index controller for bit flags
+ */
+struct IndexController{
+public:
+ typedef IndexController self_type;
+
+public:
+ IndexController() :
+ isSorted(false),
+ isPartialSorted(false),
+ unused(0)
+ {}
+
+ IndexController(const char* const data){ memcpy(this, data, sizeof(BYTE)); }
+
+private:
+ friend std::ofstream& operator<<(std::ofstream& stream, const self_type& controller){
+ stream.write((const char*)&controller, sizeof(BYTE));
+ return stream;
+ }
+
+ friend std::ifstream& operator>>(std::ifstream& stream, self_type& controller){
+ stream.read((char*)&controller, sizeof(BYTE));
+ return stream;
+ }
+
+public:
+ BYTE isSorted: 1,
+ isPartialSorted: 1,
+ unused: 6;
+};
+
+/**<
+ * This container handles the index entries for `twk` blocks: their
+ * start and end IO positions and what genomic regions they cover.
+ * The value type of this container are containers of entries.
+ */
+class Index{
+private:
+ typedef Index self_type;
+ typedef Totempole::Footer footer_type;
+ typedef Totempole::IndexEntry value_type;
+ typedef Totempole::IndexContainer container_type;
+ typedef Totempole::IndexMetaContainer meta_container_type;
+ typedef Totempole::IndexMetaEntry meta_entry_type;
+ typedef value_type& reference;
+ typedef const value_type& const_reference;
+ typedef value_type* pointer;
+ typedef const value_type* const_pointer;
+ typedef std::ptrdiff_t difference_type;
+ typedef std::size_t size_type;
+ typedef IO::BasicBuffer buffer_type;
+ typedef IndexController controller_type;
+
+public:
+ Index();
+ ~Index();
+
+ // Reading an index from a byte stream
+ Index(const char* const data, const U32 l_data);
+
+ // Capacity
+ inline const size_type& size(void) const{ return(this->container_.size()); }
+ inline const size_type& sizeMeta(void) const{ return(this->meta_container_.size()); }
+
+ // Accessors
+ inline container_type& getContainer(void){ return(this->container_); }
+ inline const container_type& getContainer(void) const{ return(this->container_); }
+ inline meta_container_type& getMetaContainer(void){ return(this->meta_container_); }
+ inline const meta_container_type& getMetaContainer(void) const{ return(this->meta_container_); }
+ inline controller_type& getController(void){ return(this->controller_); }
+ inline const controller_type& getController(void) const{ return(this->controller_); }
+
+ // Setters
+ inline void setSorted(const bool yes){ this->controller_.isSorted = yes; }
+ inline void setPartialSorted(const bool yes){ this->controller_.isPartialSorted = yes; }
+
+ // Getters
+ inline const bool isSorted(void) const{ return(this->controller_.isSorted); }
+ inline const bool isPartialSorted(void) const{ return(this->controller_.isPartialSorted); }
+ inline const U64 totalBytes(void) const{
+ U64 total_bytes = 0;
+ for(size_t i = 0; i < this->getContainer().size(); ++i)
+ total_bytes += this->getContainer().at(i).sizeBytes();
+
+ return(total_bytes);
+ }
+
+ // Overloaded
+ inline void operator<<(const_reference entry){ this->container_ += entry; }
+ inline void operator+=(const_reference entry){ this->container_ += entry; }
+
+ /**<
+ * Constructs the meta-index (one entry per contig pointing into the block index) if the data is sorted
+ * @param n_contigs Number of contigs in the file
+ * @return Returns TRUE upon success or FALSE otherwise
+ */
+ bool buildMetaIndex(const U32 n_contigs);
+
+private:
+ friend std::ofstream& operator<<(std::ofstream& stream, const self_type& index){
+ stream << index.getController();
+ stream << index.getMetaContainer();
+ stream << index.getContainer();
+ return(stream);
+ }
+
+private:
+ controller_type controller_;
+ meta_container_type meta_container_;
+ container_type container_;
+};
+
+}
+
+#endif /* INDEX_INDEX_H_ */
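
A sketch of how an Index is assembled and how the per-contig meta-index is derived once the data are sorted (assumed call order based on the interface above; entry values are illustrative):

    #include "index/index.h"

    void build_example_index(const U32 n_contigs){
        Tomahawk::Index index;

        Tomahawk::Totempole::IndexEntry entry;
        entry.byte_offset     = 0;       // compressed block starts here
        entry.byte_offset_end = 4096;    // and ends here (illustrative values)
        entry.contigID        = 0;
        entry.min_position    = 100;
        entry.max_position    = 250000;
        entry.n_variants      = 500;
        index += entry;                  // equivalent to index << entry

        index.setPartialSorted(false);
        index.setSorted(true);           // only set after a successful merge sort
        if(index.isSorted())
            index.buildMetaIndex(n_contigs); // one IndexMetaEntry per contig
    }
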
diff --git a/src/index/index_container.cpp b/src/index/index_container.cpp
new file mode 100644
index 0000000..7bbdf40
--- /dev/null
+++ b/src/index/index_container.cpp
@@ -0,0 +1,143 @@
+#include "index_container.h"
+
+namespace Tomahawk{
+namespace Totempole{
+
+IndexContainer::IndexContainer(void) :
+ n_entries_(0),
+ n_capacity_(1000),
+ entries_(static_cast<pointer>(::operator new[](this->capacity()*sizeof(value_type))))
+{
+
+}
+
+IndexContainer::IndexContainer(const size_t n_capacity_) :
+ n_entries_(0),
+ n_capacity_(n_capacity_),
+ entries_(static_cast<pointer>(::operator new[](this->capacity()*sizeof(value_type))))
+{
+
+}
+
+// Functions for when interpreting from a byte stream
+// first value is the number of indices
+IndexContainer::IndexContainer(const char* const data_buffer, const U32 l_data) :
+ n_entries_(*reinterpret_cast<const size_type*>(data_buffer)),
+ n_capacity_(this->n_entries_),
+ entries_(static_cast<pointer>(::operator new[](this->capacity()*sizeof(value_type))))
+{
+ U32 cumulative_position = sizeof(size_type);
+ for(U32 i = 0; i < this->size(); ++i){
+ new( &this->entries_[i] ) value_type(&data_buffer[cumulative_position]);
+ cumulative_position += TWK_INDEX_ENTRY_SIZE;
+ }
+ assert(cumulative_position == l_data);
+}
+
+IndexContainer::~IndexContainer(){
+ for(size_type i = 0; i < this->size(); ++i)
+ ((this->entries_ + i)->~IndexEntry)();
+
+ ::operator delete[](static_cast<void*>(this->entries_));
+}
+
+IndexContainer& IndexContainer::operator+=(const value_type& index_entry){
+ if(this->size() + 1 >= this->capacity()){
+ //std::cerr << "is full resizing" << std::endl;
+ this->resize();
+ }
+
+ //std::cerr << Helpers::timestamp("DEBUG") << "Adding: " << this->size() << "/" << this->capacity() << std::endl;
+ new( &this->entries_[this->n_entries_] ) value_type(index_entry); // invoke copy ctor
+ ++this->n_entries_;
+ return(*this);
+}
+
+void IndexContainer::resize(const size_t new_capacity){
+ //std::cerr << Helpers::timestamp("DEBUG") << "Resize: " << this->capacity() << "->" << new_capacity << std::endl;
+ // if resizing to a smaller size
+ if(new_capacity < this->capacity()){
+ // Call destructor for values between shrunk size and previous numbers
+ for(size_type i = new_capacity; i < this->size(); ++i)
+ ((this->entries_ + i)->~IndexEntry)();
+
+ this->n_entries_ = new_capacity;
+ return;
+ }
+
+ pointer temp = this->entries_; // Move current data pointer
+ this->entries_ = static_cast<pointer>(::operator new[](new_capacity*sizeof(value_type))); // Allocate new memory at old pointer
+ // Copy data over from temporary data pointer to new pointer
+ for(U32 i = 0; i < this->size(); ++i)
+ new( &this->entries_[i] ) value_type(temp[i]);
+
+ // Release memory from the temporary address
+ for(size_type i = 0; i < this->size(); ++i)
+ ((temp + i)->~IndexEntry)();
+
+ ::operator delete[](static_cast<void*>(temp));
+ this->n_capacity_ = new_capacity;
+}
+
+std::pair<const IndexEntry*, const IndexEntry*> IndexContainer::findOverlap(const S32& contigID) const{
+ // Find first hit
+ size_t i = 0;
+ for(; i < this->size(); ++i){
+ if(this->at(i).overlaps(contigID))
+ break;
+ }
+
+ if(i == this->size())
+ return(std::pair<const IndexEntry*, const IndexEntry*>(&this->at(this->size()), &this->at(this->size())));
+
+ const size_t from = i;
+ for(; i < this->size(); ++i){
+ if(this->at(i).overlaps(contigID) == false)
+ break;
+ }
+
+ return(std::pair<const IndexEntry*, const IndexEntry*>(&this->at(from), &this->at(i)));
+}
+
+std::pair<const IndexEntry*, const IndexEntry*> IndexContainer::findOverlap(const S32& contigID, const U64& position) const{
+ // Find first hit
+ size_t i = 0;
+ for(; i < this->size(); ++i){
+ if(this->at(i).overlaps(contigID, position))
+ break;
+ }
+
+ if(i == this->size())
+ return(std::pair<const IndexEntry*, const IndexEntry*>(&this->at(this->size()), &this->at(this->size())));
+
+ const size_t from = i;
+ for(; i < this->size(); ++i){
+ if(this->at(i).overlaps(contigID, position) == false)
+ break;
+ }
+
+ return(std::pair<const IndexEntry*, const IndexEntry*>(&this->at(from), &this->at(i)));
+}
+
+std::pair<const IndexEntry*, const IndexEntry*> IndexContainer::findOverlap(const S32& contigID, const U64& from_position, const U64& to_position) const{
+ // Find first hit
+ size_t i = 0;
+ for(; i < this->size(); ++i){
+ if(this->at(i).overlaps(contigID, from_position, to_position))
+ break;
+ }
+
+ if(i == this->size())
+ return(std::pair<const IndexEntry*, const IndexEntry*>(&this->at(this->size()), &this->at(this->size())));
+
+ const size_t from = i;
+ for(; i < this->size(); ++i){
+ if(this->at(i).overlaps(contigID, from_position, to_position) == false)
+ break;
+ }
+
+ return(std::pair<const IndexEntry*, const IndexEntry*>(&this->at(from), &this->at(i)));
+}
+
+}
+}
diff --git a/src/index/index_container.h b/src/index/index_container.h
new file mode 100644
index 0000000..8951d26
--- /dev/null
+++ b/src/index/index_container.h
@@ -0,0 +1,128 @@
+#ifndef INDEX_INDEX_CONTAINER_H_
+#define INDEX_INDEX_CONTAINER_H_
+
+#include <cassert>  // assert
+#include <cstddef>  // size_t, ptrdiff_t
+#include <iterator> // forward_iterator_tag
+
+#include "../support/type_definitions.h"
+#include "../io/BasicBuffer.h"
+#include "index_entry.h"
+
+namespace Tomahawk{
+namespace Totempole{
+
+/**<
+ * STL-like container for Tomahawk index entries
+ */
+class IndexContainer{
+private:
+ typedef IndexContainer self_type;
+ typedef IndexEntry value_type;
+ typedef value_type& reference;
+ typedef const value_type& const_reference;
+ typedef value_type* pointer;
+ typedef const value_type* const_pointer;
+ typedef std::ptrdiff_t difference_type;
+ typedef std::size_t size_type;
+ typedef IO::BasicBuffer buffer_type;
+
+public:
+ IndexContainer(void);
+ IndexContainer(const size_t n_capacity_);
+
+ // Functions for when interpreting from a byte stream
+ // first value is the number of indices
+ IndexContainer(const char* const data_buffer, const U32 l_data);
+ ~IndexContainer();
+
+ class iterator{
+ private:
+ typedef iterator self_type;
+ typedef std::forward_iterator_tag iterator_category;
+
+ public:
+ iterator(pointer ptr) : ptr_(ptr) { }
+ void operator++() { ptr_++; }
+ void operator++(int junk) { ptr_++; }
+ reference operator*() const{ return *ptr_; }
+ pointer operator->() const{ return ptr_; }
+ bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; }
+ bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; }
+ private:
+ pointer ptr_;
+ };
+
+ class const_iterator{
+ private:
+ typedef const_iterator self_type;
+ typedef std::forward_iterator_tag iterator_category;
+
+ public:
+ const_iterator(pointer ptr) : ptr_(ptr) { }
+ void operator++() { ptr_++; }
+ void operator++(int junk) { ptr_++; }
+ const_reference operator*() const{ return *ptr_; }
+ const_pointer operator->() const{ return ptr_; }
+ bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; }
+ bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; }
+ private:
+ pointer ptr_;
+ };
+
+ // Element access
+ inline reference at(const size_type& position){ return(this->entries_[position]); }
+ inline const_reference at(const size_type& position) const{ return(this->entries_[position]); }
+ inline reference operator[](const size_type& position){ return(this->entries_[position]); }
+ inline const_reference operator[](const size_type& position) const{ return(this->entries_[position]); }
+ inline pointer data(void){ return(this->entries_); }
+ inline const_pointer data(void) const{ return(this->entries_); }
+ inline reference front(void){ return(this->entries_[0]); }
+ inline const_reference front(void) const{ return(this->entries_[0]); }
+ inline reference back(void){ return(this->entries_[this->n_entries_ - 1]); }
+ inline const_reference back(void) const{ return(this->entries_[this->n_entries_ - 1]); }
+
+ // Capacity
+ inline const bool empty(void) const{ return(this->n_entries_ == 0); }
+ inline const size_type& size(void) const{ return(this->n_entries_); }
+ inline const size_type& capacity(void) const{ return(this->n_capacity_); }
+
+ void resize(const size_t new_capacity);
+ inline void resize(void){ this->resize(this->capacity()*2); }
+
+ // Iterator
+ inline iterator begin(){ return iterator(&this->entries_[0]); }
+ inline iterator end() { return iterator(&this->entries_[this->n_entries_]); }
+ inline const_iterator begin() const{ return const_iterator(&this->entries_[0]); }
+ inline const_iterator end() const{ return const_iterator(&this->entries_[this->n_entries_]); }
+ inline const_iterator cbegin() const{ return const_iterator(&this->entries_[0]); }
+ inline const_iterator cend() const{ return const_iterator(&this->entries_[this->n_entries_]); }
+
+ // Overload basic operator
+ self_type& operator+=(const value_type& index_entry);
+
+ // Overlap functions: find blocks a target interval overlaps
+ // returns pairs of pointers. If pointerA == pointerB then the data is empty
+ std::pair<const_pointer, const_pointer> findOverlap(const S32& contigID) const;
+ std::pair<const_pointer, const_pointer> findOverlap(const S32& contigID, const U64& position) const;
+ std::pair<const_pointer, const_pointer> findOverlap(const S32& contigID, const U64& from_position, const U64& to_position) const;
+
+private:
+ friend std::ofstream& operator<<(std::ofstream& stream, const self_type& container){
+ stream.write(reinterpret_cast<const char*>(&container.n_entries_), sizeof(size_type));
+ for(size_type i = 0; i < container.size(); ++i)
+ stream << container[i];
+
+ return stream;
+ }
+
+private:
+ size_type n_entries_;
+ size_type n_capacity_;
+ pointer entries_;
+};
+
+}
+}
+
+#endif /* INDEX_INDEX_CONTAINER_H_ */
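
A small example of the findOverlap() contract described above: an empty result is signalled by the two pointers being equal, otherwise they delimit a half-open range of index entries (a hedged sketch, assuming the container has been populated elsewhere):

    #include <utility>
    #include "index/index_container.h"

    U64 bytes_spanned(const Tomahawk::Totempole::IndexContainer& blocks,
                      const S32 contigID, const U64 from, const U64 to)
    {
        typedef const Tomahawk::Totempole::IndexEntry* const_pointer;
        std::pair<const_pointer, const_pointer> hits = blocks.findOverlap(contigID, from, to);

        U64 n_bytes = 0;
        for(const_pointer it = hits.first; it != hits.second; ++it)
            n_bytes += it->sizeBytes(); // compressed bytes covered by this block

        return(n_bytes);
    }
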
diff --git a/src/index/index_contig.h b/src/index/index_contig.h
new file mode 100644
index 0000000..4f9e08b
--- /dev/null
+++ b/src/index/index_contig.h
@@ -0,0 +1,135 @@
+#ifndef TOTEMPOLECONTIG_H_
+#define TOTEMPOLECONTIG_H_
+
+#include
+#include
+
+#include "../support/type_definitions.h"
+#include "../io/BasicBuffer.h"
+
+namespace Tomahawk{
+namespace Totempole{
+
+struct HeaderContig{
+public:
+ typedef HeaderContig self_type;
+ typedef IO::BasicBuffer buffer_type;
+
+public:
+ HeaderContig(const U32& bases, const U32& n_char, const std::string& name) :
+ n_bases(bases),
+ n_char(n_char),
+ name(name)
+ {}
+
+ HeaderContig() : n_bases(0), n_char(0){}
+
+ HeaderContig(const char* const data) :
+ n_bases(*reinterpret_cast<const U32*>(data)),
+ n_char(*reinterpret_cast<const U32*>(&data[sizeof(U32)]))
+ {
+ this->name.resize(this->n_char);
+ memcpy(&this->name[0], &data[sizeof(U32)+sizeof(U32)], this->n_char);
+ }
+
+ ~HeaderContig(){}
+
+ const U32 interpret(const char* const data){
+ this->n_bases = *reinterpret_cast<const U32*>(data);
+ this->n_char = *reinterpret_cast<const U32*>(&data[sizeof(U32)]);
+ this->name.resize(this->n_char);
+ memcpy(&this->name[0], &data[sizeof(U32)+sizeof(U32)], this->n_char);
+ return(sizeof(U32) + sizeof(U32) + this->n_char);
+ }
+
+ const U32 interpret(const U32& bases, const U32& n_char, const std::string& name){
+ this->n_bases = bases;
+ this->n_char = n_char;
+ this->name = name;
+ return(sizeof(U32) + sizeof(U32) + this->n_char);
+ }
+
+ friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){
+ stream << entry.n_bases << '\t' << entry.n_char << '\t' << entry.name;
+ return stream;
+ }
+
+ friend std::ofstream& operator<<(std::ofstream& stream, const self_type& base){
+ stream.write(reinterpret_cast<const char*>(&base.n_bases), sizeof(U32));
+ stream.write(reinterpret_cast<const char*>(&base.n_char), sizeof(U32));
+ stream.write(reinterpret_cast<const char*>(&base.name[0]), base.name.size());
+ return(stream);
+ }
+
+ friend std::istream& operator>>(std::istream& stream, self_type& base){
+ stream.read(reinterpret_cast<char*>(&base.n_bases), sizeof(U32));
+ stream.read(reinterpret_cast<char*>(&base.n_char), sizeof(U32));
+ base.name.resize(base.n_char);
+ stream.read(&base.name[0], base.n_char);
+ return(stream);
+ }
+
+ friend buffer_type& operator+=(buffer_type& buffer, self_type& base){
+ buffer += base.n_bases;
+ buffer += base.n_char;
+ buffer.Add(base.name.data(), base.name.size());
+ return(buffer);
+ }
+
+public:
+ U32 n_bases; // length of contig
+ U32 n_char; // number of chars
+ std::string name; // contig name
+};
+
+struct IndexContig : public HeaderContig{
+public:
+ typedef IndexContig self_type;
+ typedef HeaderContig parent_type;
+
+public:
+ IndexContig() : min_position(0), max_position(0), blocks_start(0), blocks_end(0){}
+ ~IndexContig(){}
+
+ friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){
+ stream << entry.name << '\t' << entry.n_bases << '\t' << entry.min_position << "-" << entry.max_position << '\t' << entry.blocks_start << "->" << entry.blocks_end;
+ return stream;
+ }
+
+ friend std::ofstream& operator<<(std::ofstream& stream, const self_type& base){
+ const parent_type* const parent = reinterpret_cast<const parent_type*>(&base);
+ stream << *parent;
+
+ stream.write(reinterpret_cast<const char*>(&base.min_position), sizeof(U32));
+ stream.write(reinterpret_cast<const char*>(&base.max_position), sizeof(U32));
+ stream.write(reinterpret_cast<const char*>(&base.blocks_start), sizeof(U32));
+ stream.write(reinterpret_cast<const char*>(&base.blocks_end), sizeof(U32));
+ return(stream);
+ }
+
+ friend std::istream& operator>>(std::istream& stream, self_type& base){
+ parent_type* parent = reinterpret_cast<parent_type*>(&base);
+ stream >> *parent;
+
+ stream.read(reinterpret_cast<char*>(&base.min_position), sizeof(U32));
+ stream.read(reinterpret_cast<char*>(&base.max_position), sizeof(U32));
+ stream.read(reinterpret_cast<char*>(&base.blocks_start), sizeof(U32));
+ stream.read(reinterpret_cast<char*>(&base.blocks_end), sizeof(U32));
+ return(stream);
+ }
+
+public:
+ // contigID is implicit
+ U32 min_position; // start position of contig
+ U32 max_position; // end position of contig
+ U32 blocks_start; // start IO-seek position of blocks
+ U32 blocks_end; // end IO-seek position of blocks
+};
+
+}
+}
+
+
+
+
+#endif /* TOTEMPOLECONTIG_H_ */
diff --git a/src/index/index_entry.h b/src/index/index_entry.h
new file mode 100644
index 0000000..4478271
--- /dev/null
+++ b/src/index/index_entry.h
@@ -0,0 +1,119 @@
+#ifndef TOTEMPOLEENTRY_H_
+#define TOTEMPOLEENTRY_H_
+
+#include
+
+namespace Tomahawk{
+namespace Totempole{
+
+#define TWK_INDEX_ENTRY_SIZE (sizeof(U64)*4 + sizeof(S32) + sizeof(U32)*2)
+
+struct IndexEntry{
+public:
+ typedef IndexEntry self_type;
+
+public:
+ IndexEntry() :
+ byte_offset(0),
+ byte_offset_end(0),
+ contigID(0),
+ min_position(0),
+ max_position(0),
+ n_variants(0),
+ uncompressed_size(0)
+ {
+ }
+
+ IndexEntry(const char* const data) :
+ byte_offset(*reinterpret_cast<const U64*>(data)),
+ byte_offset_end(*reinterpret_cast<const U64*>(&data[sizeof(U64)])),
+ contigID(*reinterpret_cast<const S32*>(&data[sizeof(U64)*2])),
+ min_position(*reinterpret_cast<const U64*>(&data[sizeof(U64)*2+sizeof(S32)])),
+ max_position(*reinterpret_cast<const U64*>(&data[sizeof(U64)*2+sizeof(S32)+sizeof(U64)])),
+ n_variants(*reinterpret_cast<const U32*>(&data[sizeof(U64)*2+sizeof(S32)+sizeof(U64)*2])),
+ uncompressed_size(*reinterpret_cast<const U32*>(&data[sizeof(U64)*2+sizeof(S32)+sizeof(U64)*2+sizeof(U32)]))
+ {
+ }
+
+ // Copy ctor
+ IndexEntry(const self_type& other) :
+ byte_offset(other.byte_offset),
+ byte_offset_end(other.byte_offset_end),
+ contigID(other.contigID),
+ min_position(other.min_position),
+ max_position(other.max_position),
+ n_variants(other.n_variants),
+ uncompressed_size(other.uncompressed_size)
+ {
+ }
+ ~IndexEntry() = default;
+
+ inline U32 size(void) const{ return(this->n_variants); }
+ inline bool isValid(void) const{ return(this->byte_offset != 0); }
+ inline void operator++(void){ ++this->n_variants; }
+ inline U64 sizeBytes(void) const{ return(this->byte_offset_end - this->byte_offset); }
+
+ friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){
+ stream << entry.byte_offset << '\t' << entry.byte_offset_end << '\t' << entry.contigID << '\t' << entry.min_position << '-' << entry.max_position << '\t' << entry.n_variants << '\t' << entry.uncompressed_size;
+ return stream;
+ }
+
+ friend std::ofstream& operator<<(std::ofstream& stream, const self_type& entry){
+ stream.write(reinterpret_cast<const char*>(&entry.byte_offset), sizeof(U64));
+ stream.write(reinterpret_cast<const char*>(&entry.byte_offset_end), sizeof(U64));
+ stream.write(reinterpret_cast<const char*>(&entry.contigID), sizeof(S32));
+ stream.write(reinterpret_cast<const char*>(&entry.min_position), sizeof(U64));
+ stream.write(reinterpret_cast<const char*>(&entry.max_position), sizeof(U64));
+ stream.write(reinterpret_cast<const char*>(&entry.n_variants), sizeof(U32));
+ stream.write(reinterpret_cast<const char*>(&entry.uncompressed_size), sizeof(U32));
+ return stream;
+ }
+
+ friend std::istream& operator>>(std::istream& stream, self_type& entry){
+ stream.read(reinterpret_cast<char*>(&entry.byte_offset), sizeof(U64));
+ stream.read(reinterpret_cast<char*>(&entry.byte_offset_end), sizeof(U64));
+ stream.read(reinterpret_cast<char*>(&entry.contigID), sizeof(S32));
+ stream.read(reinterpret_cast<char*>(&entry.min_position), sizeof(U64));
+ stream.read(reinterpret_cast<char*>(&entry.max_position), sizeof(U64));
+ stream.read(reinterpret_cast<char*>(&entry.n_variants), sizeof(U32));
+ stream.read(reinterpret_cast<char*>(&entry.uncompressed_size), sizeof(U32));
+
+ return(stream);
+ }
+
+ void reset(void){
+ this->byte_offset = 0;
+ this->byte_offset_end = 0;
+ this->contigID = 0;
+ this->min_position = 0;
+ this->max_position = 0;
+ this->n_variants = 0;
+ this->uncompressed_size = 0;
+ }
+
+ inline const bool overlaps(const S32& contigID) const{ return(this->contigID == contigID); }
+ inline const bool overlaps(const S32& contigID, const U64& position) const{
+ if(this->contigID != contigID) return false;
+ return(position >= this->min_position && position <= this->max_position);
+ }
+ inline const bool overlaps(const S32& contigID, const U64& from_position, const U64& to_position) const{
+ if(this->contigID != contigID) return false;
+ if(from_position < this->min_position) return false;
+ if(to_position > this->max_position) return false;
+ return true;
+ }
+
+public:
+ U64 byte_offset; // tellg() position in stream for start of record in Tomahawk file
+ U64 byte_offset_end; // tellg() position in stream for start of record in Tomahawk file
+ S32 contigID; // contig identifier
+ U64 min_position; // smallest bp position in tomahawk block
+ U64 max_position; // largest bp position in tomahawk block
+ U32 n_variants; // number of variants in this block
+ U32 uncompressed_size; // uncompressed size of this block
+};
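+
+// Illustrative usage sketch (the file name "index.twi" is hypothetical):
+//
+//   IndexEntry entry;
+//   ++entry;                            // count one more variant in this block
+//   std::ofstream out("index.twi", std::ios::binary);
+//   out << entry;                       // writes TWK_INDEX_ENTRY_SIZE bytes via the ofstream overload
+//
+//   std::ifstream in("index.twi", std::ios::binary);
+//   IndexEntry restored;
+//   in >> restored;                     // reads the fields back in the same order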
+
+}
+}
+
+#endif /* TOTEMPOLEENTRY_H_ */
diff --git a/src/index/index_meta_container.h b/src/index/index_meta_container.h
new file mode 100644
index 0000000..bad6d14
--- /dev/null
+++ b/src/index/index_meta_container.h
@@ -0,0 +1,187 @@
+#ifndef INDEX_INDEX_META_CONTAINER_H_
+#define INDEX_INDEX_META_CONTAINER_H_
+
+#include <cassert>
+#include <cstddef>  // size_t, ptrdiff_t
+#include <iterator> // forward_iterator_tag
+
+#include "../support/type_definitions.h"
+#include "../io/BasicBuffer.h"
+#include "index_meta_entry.h"
+
+namespace Tomahawk{
+namespace Totempole{
+
+/**<
+ * STL-like container for Tomahawk meta index entries
+ */
+class IndexMetaContainer{
+private:
+ typedef IndexMetaContainer self_type;
+ typedef IndexMetaEntry value_type;
+ typedef value_type& reference;
+ typedef const value_type& const_reference;
+ typedef value_type* pointer;
+ typedef const value_type* const_pointer;
+ typedef std::ptrdiff_t difference_type;
+ typedef std::size_t size_type;
+ typedef IO::BasicBuffer buffer_type;
+
+public:
+ IndexMetaContainer(void) :
+ n_entries_(0),
+ n_capacity_(1000),
+		entries_(static_cast<pointer>(::operator new[](this->capacity()*sizeof(value_type))))
+ {
+
+ }
+
+ IndexMetaContainer(const size_t n_capacity_) :
+ n_entries_(0),
+ n_capacity_(n_capacity_),
+		entries_(static_cast<pointer>(::operator new[](this->capacity()*sizeof(value_type))))
+ {
+
+ }
+
+ // Functions for when interpreting from a byte stream
+ // first value is the number of indices
+ IndexMetaContainer(const char* const data_buffer, const U32 l_data) :
+		n_entries_(*reinterpret_cast<const size_type*>(data_buffer)),
+		n_capacity_(this->n_entries_),
+		entries_(static_cast<pointer>(::operator new[](this->capacity()*sizeof(value_type))))
+ {
+ U32 cumulative_position = sizeof(size_type);
+ for(U32 i = 0; i < this->size(); ++i){
+ new( &this->entries_[i] ) value_type(&data_buffer[cumulative_position]);
+ cumulative_position += TWK_INDEX_META_ENTRY_SIZE;
+ }
+ assert(cumulative_position == l_data);
+ }
+
+ ~IndexMetaContainer(){
+ for(size_type i = 0; i < this->size(); ++i)
+ ((this->entries_ + i)->~IndexMetaEntry)();
+
+		::operator delete[](static_cast<void*>(this->entries_));
+ }
+
+ class iterator{
+ private:
+ typedef iterator self_type;
+ typedef std::forward_iterator_tag iterator_category;
+
+ public:
+ iterator(pointer ptr) : ptr_(ptr) { }
+ void operator++() { ptr_++; }
+ void operator++(int junk) { ptr_++; }
+ reference operator*() const{ return *ptr_; }
+ pointer operator->() const{ return ptr_; }
+ bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; }
+ bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; }
+ private:
+ pointer ptr_;
+ };
+
+ class const_iterator{
+ private:
+ typedef const_iterator self_type;
+ typedef std::forward_iterator_tag iterator_category;
+
+ public:
+ const_iterator(pointer ptr) : ptr_(ptr) { }
+ void operator++() { ptr_++; }
+ void operator++(int junk) { ptr_++; }
+ const_reference operator*() const{ return *ptr_; }
+ const_pointer operator->() const{ return ptr_; }
+ bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; }
+ bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; }
+ private:
+ pointer ptr_;
+ };
+
+ // Element access
+ inline reference at(const size_type& position){ return(this->entries_[position]); }
+ inline const_reference at(const size_type& position) const{ return(this->entries_[position]); }
+ inline reference operator[](const size_type& position){ return(this->entries_[position]); }
+ inline const_reference operator[](const size_type& position) const{ return(this->entries_[position]); }
+ inline pointer data(void){ return(this->entries_); }
+ inline const_pointer data(void) const{ return(this->entries_); }
+ inline reference front(void){ return(this->entries_[0]); }
+ inline const_reference front(void) const{ return(this->entries_[0]); }
+ inline reference back(void){ return(this->entries_[this->n_entries_ - 1]); }
+ inline const_reference back(void) const{ return(this->entries_[this->n_entries_ - 1]); }
+
+ // Capacity
+ inline const bool empty(void) const{ return(this->n_entries_ == 0); }
+ inline const size_type& size(void) const{ return(this->n_entries_); }
+ inline const size_type& capacity(void) const{ return(this->n_capacity_); }
+
+ // Iterator
+ inline iterator begin(){ return iterator(&this->entries_[0]); }
+ inline iterator end() { return iterator(&this->entries_[this->n_entries_]); }
+ inline const_iterator begin() const{ return const_iterator(&this->entries_[0]); }
+ inline const_iterator end() const{ return const_iterator(&this->entries_[this->n_entries_]); }
+ inline const_iterator cbegin() const{ return const_iterator(&this->entries_[0]); }
+ inline const_iterator cend() const{ return const_iterator(&this->entries_[this->n_entries_]); }
+
+ // Overload basic operator
+ self_type& operator+=(const value_type& index_entry){
+ if(this->size() + 1 >= this->capacity()){
+ //std::cerr << "is full resizing" << std::endl;
+ this->resize();
+ }
+
+ //std::cerr << Helpers::timestamp("DEBUG") << "Adding: " << this->size() << "/" << this->capacity() << std::endl;
+ new( &this->entries_[this->n_entries_] ) value_type(index_entry); // invoke copy ctor
+ ++this->n_entries_;
+ return(*this);
+ }
+
+ void resize(const size_t new_capacity){
+ //std::cerr << Helpers::timestamp("DEBUG") << "Resize: " << this->capacity() << "->" << new_capacity << std::endl;
+ // if resizing to a smaller size
+ if(new_capacity < this->capacity()){
+ // Call destructor for values between shrunk size and previous numbers
+ for(size_type i = new_capacity; i < this->size(); ++i)
+ ((this->entries_ + i)->~IndexMetaEntry)();
+
+ this->n_entries_ = new_capacity;
+ return;
+ }
+
+ pointer temp = this->entries_; // Move current data pointer
+		this->entries_ = static_cast<pointer>(::operator new[](new_capacity*sizeof(value_type))); // Allocate new memory block
+ // Copy data over from temporary data pointer to new pointer
+ for(U32 i = 0; i < this->size(); ++i)
+ new( &this->entries_[i] ) value_type(temp[i]);
+
+ // Release memory from the temporary address
+ for(size_type i = 0; i < this->size(); ++i)
+ ((temp + i)->~IndexMetaEntry)();
+
+		::operator delete[](static_cast<void*>(temp));
+ this->n_capacity_ = new_capacity;
+ }
+ inline void resize(void){ this->resize(this->capacity()*2); }
+
+private:
+ friend std::ofstream& operator<<(std::ofstream& stream, const self_type& container){
+		stream.write(reinterpret_cast<const char*>(&container.n_entries_), sizeof(size_type));
+ for(size_type i = 0; i < container.size(); ++i)
+ stream << container[i];
+
+ return stream;
+ }
+
+private:
+ size_type n_entries_;
+ size_type n_capacity_;
+ pointer entries_;
+};
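+
+// Illustrative usage sketch (container sizes and entry values are hypothetical):
+//
+//   IndexMetaContainer container(64);
+//   IndexMetaEntry entry;
+//   container += entry;   // copy-constructed in place; storage grows on demand
+//   for(IndexMetaContainer::const_iterator it = container.cbegin(); it != container.cend(); ++it)
+//       std::cout << *it << std::endl;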
+
+}
+}
+
+
+#endif /* INDEX_INDEX_META_CONTAINER_H_ */
diff --git a/src/index/index_meta_entry.h b/src/index/index_meta_entry.h
new file mode 100644
index 0000000..9dc53f0
--- /dev/null
+++ b/src/index/index_meta_entry.h
@@ -0,0 +1,97 @@
+#ifndef INDEX_INDEX_META_ENTRY_H_
+#define INDEX_INDEX_META_ENTRY_H_
+
+namespace Tomahawk{
+namespace Totempole{
+
+#define TWK_INDEX_META_ENTRY_SIZE (sizeof(U64)*4 + sizeof(U32)*2)
+
+struct IndexMetaEntry{
+public:
+ typedef IndexMetaEntry self_type;
+
+public:
+ IndexMetaEntry() :
+ index_begin(0),
+ index_end(0),
+ min_position(0),
+ max_position(0),
+ n_variants(0),
+ uncompressed_size(0)
+ {
+ }
+
+ IndexMetaEntry(const char* const data) :
+	index_begin(*reinterpret_cast<const U32*>(data)),
+	index_end(*reinterpret_cast<const U32*>(&data[sizeof(U32)])),
+	min_position(*reinterpret_cast<const U64*>(&data[sizeof(U32)*2])),
+	max_position(*reinterpret_cast<const U64*>(&data[sizeof(U32)*2+sizeof(U64)])),
+	n_variants(*reinterpret_cast<const U64*>(&data[sizeof(U32)*2+sizeof(U64)*2])),
+	uncompressed_size(*reinterpret_cast<const U64*>(&data[sizeof(U32)*2+sizeof(U64)*3]))
+ {
+ }
+
+ // Copy ctor
+ IndexMetaEntry(const self_type& other) :
+ index_begin(other.index_begin),
+ index_end(other.index_end),
+ min_position(other.min_position),
+ max_position(other.max_position),
+ n_variants(other.n_variants),
+ uncompressed_size(other.uncompressed_size)
+ {
+ }
+ ~IndexMetaEntry() = default;
+
+ inline U32 size(void) const{ return(this->n_variants); }
+ inline const bool empty(void) const{ return(this->n_variants == 0); }
+ inline void operator++(void){ ++this->n_variants; }
+
+ friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){
+ stream << entry.index_begin << "-" << entry.index_end << '\t' << entry.min_position << '-' << entry.max_position << '\t' << entry.n_variants << '\t' << entry.uncompressed_size;
+ return stream;
+ }
+
+ friend std::ofstream& operator<<(std::ofstream& stream, const self_type& entry){
+		stream.write(reinterpret_cast<const char*>(&entry.index_begin), sizeof(U32));
+		stream.write(reinterpret_cast<const char*>(&entry.index_end), sizeof(U32));
+		stream.write(reinterpret_cast<const char*>(&entry.min_position), sizeof(U64));
+		stream.write(reinterpret_cast<const char*>(&entry.max_position), sizeof(U64));
+		stream.write(reinterpret_cast<const char*>(&entry.n_variants), sizeof(U64));
+		stream.write(reinterpret_cast<const char*>(&entry.uncompressed_size), sizeof(U64));
+ return(stream);
+ }
+
+ friend std::istream& operator>>(std::istream& stream, self_type& entry){
+		stream.read(reinterpret_cast<char*>(&entry.index_begin), sizeof(U32));
+		stream.read(reinterpret_cast<char*>(&entry.index_end), sizeof(U32));
+		stream.read(reinterpret_cast<char*>(&entry.min_position), sizeof(U64));
+		stream.read(reinterpret_cast<char*>(&entry.max_position), sizeof(U64));
+		stream.read(reinterpret_cast<char*>(&entry.n_variants), sizeof(U64));
+		stream.read(reinterpret_cast<char*>(&entry.uncompressed_size), sizeof(U64));
+ return(stream);
+ }
+
+ void reset(void){
+ this->index_begin = 0;
+ this->index_end = 0;
+ this->min_position = 0;
+ this->max_position = 0;
+ this->n_variants = 0;
+ this->uncompressed_size = 0;
+ }
+
+public:
+ U32 index_begin;
+ U32 index_end;
+ U64 min_position; // smallest bp position in tomahawk block
+ U64 max_position; // largest bp position in tomahawk block
+ U64 n_variants; // number of variants in this block
+ U64 uncompressed_size; // uncompressed size of this block
+};
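+
+// On-disk layout of a meta entry (TWK_INDEX_META_ENTRY_SIZE bytes, matching the
+// byte-stream constructor above):
+//
+//   | index_begin (U32) | index_end (U32) | min_position (U64) | max_position (U64) | n_variants (U64) | uncompressed_size (U64) |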
+
+}
+}
+
+
+#endif /* INDEX_INDEX_META_ENTRY_H_ */
diff --git a/src/index/tomahawk_header.cpp b/src/index/tomahawk_header.cpp
new file mode 100644
index 0000000..8b567f0
--- /dev/null
+++ b/src/index/tomahawk_header.cpp
@@ -0,0 +1,270 @@
+#include "tomahawk_header.h"
+
+namespace Tomahawk{
+
+TomahawkHeader::TomahawkHeader(void) :
+ contigs_(nullptr),
+ sample_names_(nullptr),
+ contigs_hash_table_(nullptr),
+ sample_hash_table_(nullptr)
+{
+
+}
+
+// Standard dtor
+TomahawkHeader::~TomahawkHeader(void){
+ delete [] this->contigs_;
+ delete [] this->sample_names_;
+ delete this->contigs_hash_table_;
+ delete this->sample_hash_table_;
+}
+
+// Open and close functions
+int TomahawkHeader::open(std::istream& stream){
+ if(stream.good() == false){
+ std::cerr << Helpers::timestamp("ERROR") << "Stream is bad!" << std::endl;
+ return(-1);
+ }
+
+ stream >> this->magic_;
+ if(this->validate() == false){
+ std::cerr << Helpers::timestamp("ERROR") << "Failed to validate MAGIC header!" << std::endl;
+ return(-2);
+ }
+
+ if(stream.good() == false){
+ std::cerr << Helpers::timestamp("ERROR") << "Stream is bad!" << std::endl;
+ return(-1);
+ }
+
+ // Parse literal block
+ compressor_type tgzf_controller(this->magic_.l_header_uncompressed + 1024);
+ buffer_type buffer(this->magic_.l_header + 1024);
+ buffer_type buffer_uncompressed(this->magic_.l_header_uncompressed + 1024);
+ stream.read(buffer.data(), this->magic_.l_header);
+ buffer.n_chars = this->magic_.l_header;
+
+ if(stream.good() == false){
+ std::cerr << Helpers::timestamp("ERROR") << "Stream is bad!" << std::endl;
+ return(-1);
+ }
+
+ if(!tgzf_controller.Inflate(buffer, buffer_uncompressed)){
+ std::cerr << Helpers::timestamp("ERROR", "TGZF") << "Failed to get deflate literal TGZF DATA!" << std::endl;
+ return(-3);
+ }
+
+ // Parse contigs
+ // Parse names
+ // Construct hash tables
+ U32 buffer_position = 0;
+
+ // Parse contigs
+ this->contigs_ = new contig_type[this->magic_.getNumberContigs()];
+ for(U32 i = 0; i < this->magic_.getNumberContigs(); ++i){
+ buffer_position += this->contigs_[i].interpret(&buffer_uncompressed[buffer_position]);
+ assert(buffer_position < buffer_uncompressed.size());
+ }
+
+ // Parse sample names
+ // Encoded as |length in characters|character buffer|
+ this->sample_names_ = new std::string[this->magic_.getNumberSamples()];
+ for(U32 i = 0; i < this->magic_.getNumberSamples(); ++i){
+		const U32 length = *reinterpret_cast<const U32*>(&buffer_uncompressed[buffer_position]);
+ buffer_position += sizeof(U32);
+
+ this->sample_names_[i] = std::string(&buffer_uncompressed[buffer_position], length);
+ buffer_position += length;
+ assert(buffer_position < buffer_uncompressed.size());
+ }
+
+ // Remainder is literal data
+ const U32 l_literals = buffer_uncompressed.size() - buffer_position;
+ this->literals_ = std::string(&buffer_uncompressed[buffer_position], l_literals);
+
+ // Build hash tables for contigs and sample names
+ if(this->BuildHashTables() == false){
+ std::cerr << Helpers::timestamp("ERROR") << "Cannot build hash tables" << std::endl;
+ return(-4);
+ }
+
+ // Buffer cleanup
+ buffer.deleteAll();
+ buffer_uncompressed.deleteAll();
+
+ return(1);
+}
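+
+// The sample-name block parsed above uses a simple length-prefixed encoding.
+// Illustrative writer-side sketch (the sample name is hypothetical; see write() below):
+//
+//   IO::BasicBuffer buffer(1024);
+//   const std::string name = "NA12878";
+//   buffer += (U32)name.size();            // |length in characters|
+//   buffer.Add(name.data(), name.size());  // |character buffer|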
+
+int TomahawkHeader::write(std::ostream& stream){
+ if(stream.good() == false){
+ std::cerr << Helpers::timestamp("ERROR") << "Stream is bad!" << std::endl;
+ return(-1);
+ }
+
+ // Compute uncompressed size
+ const U32 l_uncompressed_size = this->DetermineUncompressedSize();
+
+ buffer_type buffer(l_uncompressed_size + 1024);
+ for(U32 i = 0; i < this->magic_.getNumberContigs(); ++i){
+ //std::cerr << Helpers::timestamp("DEBUG") << this->contigs_[i] << std::endl;
+ buffer += this->contigs_[i];
+ }
+
+ for(U32 i = 0; i < this->magic_.getNumberSamples(); ++i){
+ buffer += (U32)this->sample_names_[i].size();
+ //std::cerr << Helpers::timestamp("DEBUG") << this->sample_names_[i] << std::endl;
+ buffer.Add(this->sample_names_[i].data(), this->sample_names_[i].size());
+ }
+
+ buffer.Add(this->literals_.data(), this->literals_.size());
+ this->magic_.l_header_uncompressed = buffer.size();
+ //std::cerr << buffer.size() << "\t" << l_uncompressed_size << std::endl;
+ assert(this->magic_.l_header_uncompressed == l_uncompressed_size);
+
+
+ compressor_type tgzf_controller(this->magic_.l_header_uncompressed + 1024);
+ if(!tgzf_controller.Deflate(buffer)){
+ std::cerr << Helpers::timestamp("ERROR", "TGZF") << "Failed to get deflate literal TGZF DATA!" << std::endl;
+ return(-3);
+ }
+
+ // Store compressed size
+ this->magic_.l_header = tgzf_controller.buffer.size();
+
+ stream << this->magic_;
+ if(stream.good() == false){
+ std::cerr << Helpers::timestamp("ERROR") << "Stream is bad!" << std::endl;
+ return(-1);
+ }
+ stream.write(tgzf_controller.buffer.data(), tgzf_controller.buffer.size());
+
+ //std::cerr << Helpers::timestamp("DEBUG") << this->magic_.l_header << "->" << this->magic_.l_header_uncompressed << '\t' << buffer.size() << "/" << buffer.capacity() << std::endl;
+
+ // Cleanup buffer
+ buffer.deleteAll();
+
+ return(1);
+}
+
+const bool TomahawkHeader::getSample(const std::string& sample_name, const std::string*& return_target) const{
+ if(this->sample_hash_table_ == nullptr)
+ return false;
+
+ if(sample_name.size() == 0)
+ return false;
+
+ if(this->sample_hash_table_->occupied() == 0)
+ return false;
+
+ S32* target = nullptr;
+ if(this->sample_hash_table_->GetItem(&sample_name[0], &sample_name, target, sample_name.length())){
+ return_target = &this->sample_names_[*target];
+ return true;
+ }
+ return false;
+}
+
+const bool TomahawkHeader::getContigName(const std::string& contig_name, const std::string*& return_target) const{
+ if(this->contigs_hash_table_ == nullptr)
+ return false;
+
+ if(contig_name.size() == 0)
+ return false;
+
+ if(this->contigs_hash_table_->occupied() == 0)
+ return false;
+
+ S32* target = nullptr;
+ if(this->contigs_hash_table_->GetItem(&contig_name[0], &contig_name, target, contig_name.length())){
+ return_target = &this->contigs_[*target].name;
+ return true;
+ }
+ return false;
+}
+
+const bool TomahawkHeader::getContig(const std::string& contig_name, const contig_type*& return_target) const{
+ if(this->contigs_hash_table_ == nullptr)
+ return false;
+
+ if(contig_name.size() == 0)
+ return false;
+
+ if(this->contigs_hash_table_->occupied() == 0)
+ return false;
+
+ S32* target = nullptr;
+ if(this->contigs_hash_table_->GetItem(&contig_name[0], &contig_name, target, contig_name.length())){
+ return_target = &this->contigs_[*target];
+ return true;
+ }
+ return false;
+}
+
+const S32 TomahawkHeader::getContigID(const std::string& contig_name) const{
+	if(this->contigs_hash_table_ == nullptr)
+		return(-1);
+
+	if(contig_name.size() == 0)
+		return(-1);
+
+	if(this->contigs_hash_table_->occupied() == 0)
+		return(-1);
+
+ S32* target = nullptr;
+ if(this->contigs_hash_table_->GetItem(&contig_name[0], &contig_name, target, contig_name.length())){
+ return(*target);
+ }
+ return(-1);
+}
+
+bool TomahawkHeader::BuildHashTables(void){
+ // For contigs
+ if(this->magic_.getNumberContigs() * 2 < 1024)
+ this->contigs_hash_table_ = new hash_table(1024);
+ else
+ this->contigs_hash_table_ = new hash_table(this->magic_.getNumberContigs() * 2);
+
+ S32* retValue = 0;
+ for(U32 i = 0; i < this->magic_.getNumberContigs(); ++i){
+ if(this->contigs_hash_table_->GetItem(&this->contigs_[i].name[0], &this->contigs_[i].name, retValue, this->contigs_[i].name.size())){
+ std::cerr << Helpers::timestamp("ERROR", "TOTEMPOLE") << "Duplicated contig! Impossible!" << std::endl;
+ return false;
+ }
+ this->contigs_hash_table_->SetItem(&this->contigs_[i].name[0], &this->contigs_[i].name, i, this->contigs_[i].name.size());
+ }
+
+ // For sample names
+ if(this->magic_.getNumberSamples() * 2 < 1024)
+ this->sample_hash_table_ = new hash_table(1024);
+ else
+ this->sample_hash_table_ = new hash_table(this->magic_.getNumberSamples() * 2);
+
+ retValue = 0;
+ for(U32 i = 0; i < this->magic_.getNumberSamples(); ++i){
+ if(this->sample_hash_table_->GetItem(&this->sample_names_[i][0], &this->sample_names_[i], retValue, this->sample_names_[i].size())){
+ std::cerr << Helpers::timestamp("ERROR", "TOTEMPOLE") << "Duplicated name! Impossible!" << std::endl;
+ return false;
+ }
+ this->sample_hash_table_->SetItem(&this->sample_names_[i][0], &this->sample_names_[i], i, this->sample_names_[i].size());
+ }
+
+ return true;
+}
+
+const U32 TomahawkHeader::DetermineUncompressedSize(void) const{
+ U32 l_uncompressed = 0;
+ for(U32 i = 0; i < this->magic_.getNumberContigs(); ++i){
+ l_uncompressed += this->contigs_[i].name.size() + sizeof(U32)*2;
+ }
+
+ for(U32 i = 0; i < this->magic_.getNumberSamples(); ++i){
+ l_uncompressed += this->sample_names_[i].size() + sizeof(U32);
+ }
+
+ l_uncompressed += this->literals_.size();
+
+ return(l_uncompressed);
+}
+
+
+}
diff --git a/src/index/tomahawk_header.h b/src/index/tomahawk_header.h
new file mode 100644
index 0000000..2ab2f17
--- /dev/null
+++ b/src/index/tomahawk_header.h
@@ -0,0 +1,70 @@
+#ifndef INDEX_TOMAHAWK_HEADER_H_
+#define INDEX_TOMAHAWK_HEADER_H_
+
+#include <iostream>
+
+#include "../algorithm/open_hashtable.h"
+#include "index_contig.h"
+#include "../io/BasicBuffer.h"
+#include "../tomahawk/tomahawk_magic_header.h"
+#include "../io/compression/TGZFController.h"
+
+namespace Tomahawk{
+
+/**<
+ * This container handles the header data for
+ * a `twk` file
+ */
+class TomahawkHeader{
+public:
+ typedef TomahawkHeader self_type;
+ typedef Totempole::HeaderContig contig_type;
+ typedef Base::TomahawkMagicHeader magic_type;
+ typedef IO::BasicBuffer buffer_type;
+	typedef Hash::HashTable<std::string, S32> hash_table;
+ typedef IO::TGZFController compressor_type;
+
+public:
+ TomahawkHeader(void);
+
+ // Standard dtor
+ ~TomahawkHeader(void);
+
+ // Open and close functions
+ int open(std::istream& stream = std::cin);
+ int write(std::ostream& stream = std::cout);
+
+ // Accessors
+ inline std::string& getLiterals(void){ return(this->literals_); }
+ inline const std::string& getLiterals(void) const{ return(this->literals_); }
+ inline std::string& getSample(const U32 position){ return(this->sample_names_[position]); }
+ inline const std::string& getSample(const U32 position) const{ return(this->sample_names_[position]); }
+
+ const bool getSample(const std::string& sample_name, const std::string*& return_target) const;
+ const bool getContigName(const std::string& contig_name, const std::string*& return_target) const;
+ const bool getContig(const std::string& contig_name, const contig_type*& return_target) const;
+ const S32 getContigID(const std::string& contig_name) const;
+
+ inline magic_type& getMagic(void){ return(this->magic_); }
+ inline const magic_type& getMagic(void) const{ return(this->magic_); }
+
+ // Updater
+ inline void addLiteral(const std::string& string){ this->literals_ += string; }
+ inline const bool validate(void) const{ return(this->magic_.validate()); }
+
+private:
+ bool BuildHashTables(void);
+ const U32 DetermineUncompressedSize(void) const;
+
+public:
+ magic_type magic_; // magic header
+ std::string literals_; // literal data
+ contig_type* contigs_; // contig data
+ std::string* sample_names_; // sample names
+ hash_table* contigs_hash_table_; // contig name hash table
+ hash_table* sample_hash_table_; // sample name hash table
+};
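+
+// Illustrative usage sketch (the file and contig names are hypothetical):
+//
+//   std::ifstream in("example.twk", std::ios::binary);
+//   TomahawkHeader header;
+//   if(header.open(in) != 1){
+//       // open() returns 1 on success and a negative code on failure
+//   }
+//   const S32 contigID = header.getContigID("20");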
+
+}
+
+#endif /* INDEX_TOMAHAWK_HEADER_H_ */
diff --git a/src/interface/ProgressBar.h b/src/interface/progressbar.h
similarity index 99%
rename from src/interface/ProgressBar.h
rename to src/interface/progressbar.h
index d52e066..85dde5e 100644
--- a/src/interface/ProgressBar.h
+++ b/src/interface/progressbar.h
@@ -9,7 +9,7 @@
#include
#include "../support/helpers.h"
-#include "Timer.h"
+#include "timer.h"
namespace Tomahawk{
namespace Interface{
diff --git a/src/interface/Timer.h b/src/interface/timer.h
similarity index 100%
rename from src/interface/Timer.h
rename to src/interface/timer.h
diff --git a/src/io/BasicBuffer.cpp b/src/io/BasicBuffer.cpp
deleted file mode 100644
index d39c17c..0000000
--- a/src/io/BasicBuffer.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * BasicBuffer.cpp
- *
- * Created on: 20 Feb 2017
- * Author: mk21
- */
-
-#include "BasicBuffer.h"
-
-namespace Tomahawk {
-namespace IO{
-
-}
-} /* namespace Tomahawk */
diff --git a/src/io/BasicBuffer.h b/src/io/BasicBuffer.h
index 8281acd..516b6cb 100644
--- a/src/io/BasicBuffer.h
+++ b/src/io/BasicBuffer.h
@@ -3,54 +3,105 @@
#include
#include
-#include "../support/TypeDefinitions.h"
+#include "../support/type_definitions.h"
#include "../support/helpers.h"
namespace Tomahawk {
namespace IO{
struct BasicBuffer{
- typedef BasicBuffer self_type;
-
- BasicBuffer() : pointer(0), width(0), data(nullptr){}
- BasicBuffer(const U64 size) : pointer(0), width(size), data(new char[size]){}
- BasicBuffer(char* target, const size_t length) : pointer(length), width(length), data(target){}
- BasicBuffer(const U64 size, char* target) : pointer(0), width(size), data(target){}
- BasicBuffer(const self_type& other) : pointer(0), width(other.width), data(new char[other.width]){}
+private:
+ typedef BasicBuffer self_type;
+ typedef char value_type;
+ typedef value_type& reference;
+ typedef const value_type& const_reference;
+ typedef value_type* pointer;
+ typedef const value_type* const_pointer;
+ typedef std::ptrdiff_t difference_type;
+ typedef std::size_t size_type;
+
+public:
+ BasicBuffer(void) : n_chars(0), width(0), buffer(nullptr){}
+ BasicBuffer(const U64 size) : n_chars(0), width(size), buffer(new value_type[size]){}
+ BasicBuffer(pointer target, const size_t length) : n_chars(length), width(length), buffer(target){}
+ BasicBuffer(const U64 size, pointer target) : n_chars(0), width(size), buffer(target){}
+ BasicBuffer(const self_type& other) : n_chars(0), width(other.width), buffer(new value_type[other.width]){}
virtual ~BasicBuffer(){}
+ class iterator{
+ private:
+ typedef iterator self_type;
+ typedef std::forward_iterator_tag iterator_category;
+
+ public:
+ iterator(pointer ptr) : ptr_(ptr) { }
+ void operator++() { ptr_++; }
+ void operator++(int junk) { ptr_++; }
+ reference operator*() const{ return *ptr_; }
+ pointer operator->() const{ return ptr_; }
+ bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; }
+ bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; }
+ private:
+ pointer ptr_;
+ };
+
+ class const_iterator{
+ private:
+ typedef const_iterator self_type;
+ typedef std::forward_iterator_tag iterator_category;
+
+ public:
+ const_iterator(pointer ptr) : ptr_(ptr) { }
+ void operator++() { ptr_++; }
+ void operator++(int junk) { ptr_++; }
+ const_reference operator*() const{ return *ptr_; }
+ const_pointer operator->() const{ return ptr_; }
+ bool operator==(const self_type& rhs) const{ return ptr_ == rhs.ptr_; }
+ bool operator!=(const self_type& rhs) const{ return ptr_ != rhs.ptr_; }
+ private:
+ pointer ptr_;
+ };
+
+ // Iterator
+ inline iterator begin(){ return iterator(&this->buffer[0]); }
+	inline iterator end() { return iterator(&this->buffer[this->n_chars]); }
+	inline const_iterator begin() const{ return const_iterator(&this->buffer[0]); }
+	inline const_iterator end() const{ return const_iterator(&this->buffer[this->n_chars]); }
+	inline const_iterator cbegin() const{ return const_iterator(&this->buffer[0]); }
+	inline const_iterator cend() const{ return const_iterator(&this->buffer[this->n_chars]); }
+
inline void set(const size_t size){
- this->pointer = 0;
+ this->n_chars = 0;
this->width = size;
- if(this->data != nullptr)
- delete [] this->data;
+ if(this->buffer != nullptr)
+ delete [] this->buffer;
- this->data = new char[size];
+ this->buffer = new char[size];
}
- inline void deleteAll(void){ delete [] this->data; } // manual cleaup
+	inline void deleteAll(void){ delete [] this->buffer; } // manual cleanup
inline void set(const size_t size, char* target){
- this->pointer = 0;
+ this->n_chars = 0;
this->width = size;
- this->data = target;
+ this->buffer = target;
}
inline virtual void set(char* target){
- this->pointer = 0;
+ this->n_chars = 0;
this->width = 0;
- this->data = target;
+ this->buffer = target;
}
- inline void reset(){ this->pointer = 0; }
- inline void move(const U64 to){ this->pointer = to; }
- inline const U64& size(void) const{ return this->pointer; }
+ inline void reset(){ this->n_chars = 0; }
+ inline void move(const U64 to){ this->n_chars = to; }
+ inline const U64& size(void) const{ return this->n_chars; }
inline const U64& capacity(void) const{ return this->width; }
void resize(const U64 new_size){
if(new_size <= this->capacity()){
if(new_size < this->size())
- this->pointer = new_size;
+ this->n_chars = new_size;
return;
}
@@ -58,13 +109,13 @@ struct BasicBuffer{
U64 copy_to = this->size();
if(new_size < this->size()){
copy_to = new_size;
- this->pointer = copy_to;
+ this->n_chars = copy_to;
}
- //std::cerr << Helpers::timestamp("DEBUG") << "Resizing buffer: " << this->capacity() << " -> " << new_size << "\tcopyto: " << copy_to << std::endl;
- char* target = this->data;
- this->data = new char[new_size];
- memcpy(&this->data[0], &target[0], copy_to);
+ //std::cerr << utility::timestamp("DEBUG") << "Resizing buffer: " << this->capacity() << " -> " << new_size << "\tcopyto: " << copy_to << std::endl;
+ char* target = this->buffer;
+ this->buffer = new char[new_size];
+ memcpy(&this->buffer[0], &target[0], copy_to);
delete [] target;
this->width = new_size;
}
@@ -79,137 +130,189 @@ struct BasicBuffer{
if(this->size() + length >= this->capacity())
this->resize((this->size() + length) * 2);
- memcpy(&this->data[this->pointer], &data[0], length);
- this->pointer += length;
+ memcpy(&this->buffer[this->n_chars], &data[0], length);
+ this->n_chars += length;
+ }
+
+ void AddReadble(const SBYTE& value){
+ const int ret = sprintf(&this->buffer[this->n_chars], "%d", value);
+ this->n_chars += ret;
+ }
+
+ void AddReadble(const S16& value){
+ const int ret = sprintf(&this->buffer[this->n_chars], "%d", value);
+ this->n_chars += ret;
+ }
+
+ void AddReadble(const S32& value){
+ const int ret = sprintf(&this->buffer[this->n_chars], "%d", value);
+ this->n_chars += ret;
+ }
+
+ void AddReadble(const BYTE& value){
+ const int ret = sprintf(&this->buffer[this->n_chars], "%u", value);
+ this->n_chars += ret;
+ }
+
+ void AddReadble(const U16& value){
+ const int ret = sprintf(&this->buffer[this->n_chars], "%u", value);
+ this->n_chars += ret;
+ }
+
+ void AddReadble(const U32& value){
+ const int ret = sprintf(&this->buffer[this->n_chars], "%u", value);
+ this->n_chars += ret;
+ }
+
+ void AddReadble(const U64& value){
+ const int ret = sprintf(&this->buffer[this->n_chars], "%llu", value);
+ this->n_chars += ret;
+ }
+
+ void AddReadble(const float& value){
+ const int ret = sprintf(&this->buffer[this->n_chars], "%g", value);
+ this->n_chars += ret;
+ }
+
+ void AddReadble(const double& value){
+ const int ret = sprintf(&this->buffer[this->n_chars], "%g", value);
+ this->n_chars += ret;
}
inline self_type& operator+=(const self_type& other){
if(this->size() + other.size() >= this->capacity())
this->resize((this->size() + other.size()) * 2);
- memcpy(&this->data[this->pointer], other.data, other.pointer);
- this->pointer += other.pointer;
+ memcpy(&this->buffer[this->n_chars], other.buffer, other.n_chars);
+ this->n_chars += other.n_chars;
return *this;
}
inline self_type& operator+=(const char& value){
- if(this->pointer + sizeof(char) >= this->width)
+ if(this->n_chars + sizeof(char) >= this->width)
this->resize(this->width*2);
- this->data[this->pointer] = value;
- ++this->pointer;
+ this->buffer[this->n_chars] = value;
+ ++this->n_chars;
return *this;
}
inline self_type& operator+=(const BYTE& value){
- if(this->pointer + sizeof(BYTE) >= this->width)
+ if(this->n_chars + sizeof(BYTE) >= this->width)
this->resize(this->width*2);
-		BYTE* p = reinterpret_cast<BYTE*>(&this->data[this->pointer]);
+		BYTE* p = reinterpret_cast<BYTE*>(&this->buffer[this->n_chars]);
*p = value;
- this->pointer += sizeof(BYTE);
+ this->n_chars += sizeof(BYTE);
return *this;
}
inline self_type& operator+=(const float& value){
- if(this->pointer + sizeof(float) >= this->width)
+ if(this->n_chars + sizeof(float) >= this->width)
this->resize(this->width*2);
-		float* p = reinterpret_cast<float*>(&this->data[this->pointer]);
+		float* p = reinterpret_cast<float*>(&this->buffer[this->n_chars]);
*p = value;
- this->pointer += sizeof(float);
+ this->n_chars += sizeof(float);
return *this;
}
inline self_type& operator+=(const U16 value){
- if(this->pointer + sizeof(U16) >= this->width)
+ if(this->n_chars + sizeof(U16) >= this->width)
this->resize(this->width*2);
-		U16* p = reinterpret_cast<U16*>(&this->data[this->pointer]);
+		U16* p = reinterpret_cast<U16*>(&this->buffer[this->n_chars]);
*p = value;
- this->pointer += sizeof(U16);
+ this->n_chars += sizeof(U16);
return *this;
}
inline self_type& operator+=(const short& value){
- if(this->pointer + sizeof(short) >= this->width)
+ if(this->n_chars + sizeof(short) >= this->width)
this->resize(this->width*2);
-		short* p = reinterpret_cast<short*>(&this->data[this->pointer]);
+		short* p = reinterpret_cast<short*>(&this->buffer[this->n_chars]);
*p = value;
- this->pointer += sizeof(short);
+ this->n_chars += sizeof(short);
return *this;
}
inline self_type& operator+=(const U32& value){
- if(this->pointer + sizeof(U32) >= this->width)
+ if(this->n_chars + sizeof(U32) >= this->width)
this->resize(this->width*2);
-		U32* p = reinterpret_cast<U32*>(&this->data[this->pointer]);
+		U32* p = reinterpret_cast<U32*>(&this->buffer[this->n_chars]);
*p = value;
- this->pointer += sizeof(U32);
+ this->n_chars += sizeof(U32);
return *this;
}
inline self_type& operator+=(const S32& value){
- if(this->pointer + sizeof(S32) >= this->width)
+ if(this->n_chars + sizeof(S32) >= this->width)
this->resize(this->width*2);
-		S32* p = reinterpret_cast<S32*>(&this->data[this->pointer]);
+		S32* p = reinterpret_cast<S32*>(&this->buffer[this->n_chars]);
*p = value;
- this->pointer += sizeof(S32);
+ this->n_chars += sizeof(S32);
return *this;
}
inline self_type& operator+=(const double& value){
- if(this->pointer + sizeof(double) >= this->width)
+ if(this->n_chars + sizeof(double) >= this->width)
this->resize(this->width*2);
-		double* p = reinterpret_cast<double*>(&this->data[this->pointer]);
+		double* p = reinterpret_cast<double*>(&this->buffer[this->n_chars]);
*p = value;
- this->pointer += sizeof(double);
+ this->n_chars += sizeof(double);
return *this;
}
inline self_type& operator+=(const U64& value){
- if(this->pointer + sizeof(U64) >= this->width)
+ if(this->n_chars + sizeof(U64) >= this->width)
this->resize(this->width*2);
-		U64* p = reinterpret_cast<U64*>(&this->data[this->pointer]);
+		U64* p = reinterpret_cast<U64*>(&this->buffer[this->n_chars]);
*p = value;
- this->pointer += sizeof(U64);
+ this->n_chars += sizeof(U64);
return *this;
}
inline self_type& operator+=(const std::string& value){
- if(this->pointer + value.size() + sizeof(BYTE) >= this->width){
+ if(this->n_chars + value.size() + sizeof(BYTE) >= this->width){
U64 resize_to = this->width * 2;
- while(this->pointer + value.size() + sizeof(BYTE) >= resize_to)
+ while(this->n_chars + value.size() + sizeof(BYTE) >= resize_to)
resize_to *= 2;
this->resize(resize_to);
}
for(U32 i = 0; i < value.size(); ++i){
- this->data[this->pointer] = value[i];
- ++this->pointer;
+ this->buffer[this->n_chars] = value[i];
+ ++this->n_chars;
}
return *this;
}
- char& operator[](const U64 size){ return this->data[size]; }
- const char& operator[](const U64 size) const{ return this->data[size]; }
+ inline reference operator[](const U64 position){ return this->buffer[position]; }
+ inline const_reference operator[](const U64 position) const{ return this->buffer[position]; }
+ inline reference at(const U64 position){ return this->buffer[position]; }
+ inline const_reference at(const U64 position) const{ return this->buffer[position]; }
+ inline pointer data(void){ return(this->buffer); }
+ inline const_pointer data(void) const{ return(this->buffer); }
+
+private:
friend std::ostream& operator<<(std::ostream& out, const self_type& data){
- out.write(data.data, data.pointer);
+ out.write(data.buffer, data.n_chars);
return(out);
}
- U64 pointer;
- U64 width;
- char* data;
+public:
+ U64 n_chars;
+ U64 width;
+ pointer buffer;
};
} /* namespace IO */
diff --git a/src/io/BasicWriters.h b/src/io/BasicWriters.h
index a3c8024..9cb735b 100644
--- a/src/io/BasicWriters.h
+++ b/src/io/BasicWriters.h
@@ -15,7 +15,7 @@ namespace IO{
class GenericWriterInterace {
protected:
- typedef IO::BasicBuffer buffer_type;
+ typedef IO::BasicBuffer buffer_type;
typedef Algorithm::SpinLock lock_type;
public:
@@ -39,8 +39,8 @@ class GenericWriterInterace {
virtual bool close(void) =0;
inline lock_type* getLock(void){ return(&this->lock); }
- virtual inline const size_t writeNoLock(const char* data, const U32 length) =0;
- virtual inline const size_t writeNoLock(const buffer_type& buffer) =0;
+ virtual const size_t writeNoLock(const char* data, const U32 length) =0;
+ virtual const size_t writeNoLock(const buffer_type& buffer) =0;
protected:
lock_type lock;
@@ -76,8 +76,8 @@ class WriterStandardOut : public GenericWriterInterace{
}
inline const size_t writeNoLock(const buffer_type& buffer){
- std::cout.write(&buffer.data[0], buffer.pointer);
- return(buffer.pointer);
+ std::cout.write(buffer.data(), buffer.size());
+ return(buffer.size());
}
void operator<<(void* entry){}
@@ -87,7 +87,7 @@ class WriterStandardOut : public GenericWriterInterace{
		// Note that threads enter here at random
		// It is extremely unlikely there is ever any contention
this->lock.lock();
- std::cout.write(&buffer.data[0], buffer.size());
+ std::cout.write(buffer.data(), buffer.size());
this->lock.unlock();
}
};
@@ -108,12 +108,15 @@ class WriterFile : public GenericWriterInterace{
}
bool open(const std::string output){
+ std::cerr << "here in open: " << output << std::endl;
if(output.length() == 0){
std::cerr << Helpers::timestamp("ERROR", "WRITER") << "No output name provided..." << std::endl;
return false;
}
+ std::cerr << "after test" << std::endl;
this->stream.open(output, std::ios::binary | std::ios::out);
+ std::cerr << "after first open" << std::endl;
if(!this->stream.good()){
std::cerr << Helpers::timestamp("ERROR", "WRITER") << "Could not open output file: " << output << "..." << std::endl;
return false;
@@ -122,6 +125,8 @@ class WriterFile : public GenericWriterInterace{
if(!SILENT)
std::cerr << Helpers::timestamp("LOG", "WRITER") << "Opening output file: " << output << "..." << std::endl;
+		std::cerr << "returning open" << std::endl;
+ std::cerr << this->stream.good() << std::endl;
return true;
}
@@ -130,16 +135,22 @@ class WriterFile : public GenericWriterInterace{
inline void flush(void){ this->stream.flush(); }
inline bool close(void){ this->stream.close(); return true; }
+	template <class Y>
+ void operator<<(const Y& value){
+ this->stream << value;
+ }
+
void operator<<(const buffer_type& buffer){
// Mutex lock; write; unlock
		// Note that threads enter here at random
		// It is extremely unlikely there is ever any contention
this->lock.lock();
- this->stream.write(&buffer.data[0], buffer.size());
+ this->stream.write(buffer.data(), buffer.size());
this->lock.unlock();
}
void operator<<(void* entry){}
+
const size_t write(const char* data, const U64& length){
this->lock.lock();
this->stream.write(&data[0], length);
@@ -153,12 +164,12 @@ class WriterFile : public GenericWriterInterace{
}
inline const size_t writeNoLock(const buffer_type& buffer){
- this->stream.write(&buffer.data[0], buffer.pointer);
- return(buffer.pointer);
+ this->stream.write(buffer.data(), buffer.size());
+ return(buffer.size());
}
private:
- std::string outFile;
+ std::string outFile;
std::ofstream stream;
};
diff --git a/src/io/PackedEntryReader.h b/src/io/PackedEntryReader.h
deleted file mode 100644
index e3c7bd5..0000000
--- a/src/io/PackedEntryReader.h
+++ /dev/null
@@ -1,168 +0,0 @@
-#ifndef PACKEDENTRYREADER_H_
-#define PACKEDENTRYREADER_H_
-
-namespace Tomahawk{
-namespace IO{
-
-#define PACKED_READER_DEFAULT_CHUNK 1000000 // 1MB
-
-template
-class PackedEntryReader{
- typedef TomahawkOutputEntry entry_type;
-
-public:
- PackedEntryReader();
- virtual ~PackedEntryReader();
-
- bool setup(const std::string file, size_t chunk_size = PACKED_READER_DEFAULT_CHUNK - PACKED_READER_DEFAULT_CHUNK % Y);
- bool nextEntry(const entry_type*& entry);
- virtual bool nextBlock(void);
-
- inline bool seek(const size_t pos);
- inline entry_type* begin(void){ return(this->entries); }
- inline entry_type* end(void){ return(&this->entries[this->entry_tail-1]); }
- inline entry_type* operator[](const U32& p){ return(&this->entries[p]); }
- inline const size_t& size(void) const{ return this->entry_tail; }
- inline const size_t& size_buffer(void) const{ return this->buffer_size; }
- inline void reset(void){ this->entry_head = 0; this->entry_tail = 0; }
- inline void next(void){ ++this->entry_head; }
- inline void prev(void){ --this->entry_head; }
- inline bool available(void) const{ return(this->entry_head < this->entry_tail); }
- inline bool good(void) const{ return(this->stream.good()); }
- inline const size_t& filesize(void) const{ return this->__filesize; }
- inline size_t tellg(void){ return this->stream.tellg(); }
- inline const size_t& block_size(void) const{ return this->read_block_size; }
-
-protected:
- bool open(const std::string& file);
-
-protected:
- size_t __filesize;
- size_t entry_head;
- size_t entry_tail;
- size_t buffer_size;
- size_t read_block_size;
- std::ifstream stream;
- char* buffer;
- entry_type* entries;
-};
-
-template
-PackedEntryReader::PackedEntryReader()
- : __filesize(0)
- , entry_head(0)
- , entry_tail(0)
- , buffer_size(0)
- , read_block_size(0)
- , buffer(nullptr)
- , entries(nullptr)
-{}
-
-template
-PackedEntryReader::~PackedEntryReader(){
- delete [] this->buffer;
-}
-
-template
-bool PackedEntryReader::setup(const std::string file, size_t chunk_size){
- if(!this->open(file)){
- std::cerr << Helpers::timestamp("ERROR", "IO") << "Failed to open file..." << std::endl;
- return false;
- }
-
- if(chunk_size == 0){
- std::cerr << Helpers::timestamp("ERROR", "IO") << "Illegal chunk size" << std::endl;
- return false;
- }
-
- if(chunk_size % Y != 0){
- std::cerr << "Adjusting chunk size: " << chunk_size << " -> ";
- chunk_size -= chunk_size % Y;
- std::cerr << chunk_size << std::endl;
- if(chunk_size == 0){
- std::cerr << "illegal chunk size" << std::endl;
- return false;
- }
- }
-
- this->read_block_size = chunk_size;
-
- this->reset();
- delete [] this->buffer;
- this->buffer = new char[this->read_block_size];
- this->buffer_size = this->read_block_size;
-
- return true;
-}
-
-template
-bool PackedEntryReader::open(const std::string& file){
- this->stream.open(file, std::ios::binary | std::ios::in | std::ios::ate);
- if(!this->good()){
- std::cerr << Helpers::timestamp("ERROR", "IO") << "IO-stream is bad..." << std::endl;
- return false;
- }
-
- this->__filesize = this->stream.tellg();
- this->stream.seekg(0);
-
- return true;
-}
-
-template
-bool PackedEntryReader::nextEntry(const entry_type*& entry){
- if(!this->available()){
- if(!this->nextBlock())
- return false;
- }
-
- entry = &this->entries[this->entry_head];
- this->next();
- return true;
-}
-
-template
-bool PackedEntryReader::nextBlock(void){
- if(!this->good()){
- std::cerr << Helpers::timestamp("ERROR", "IO") << "IO-stream has failed..." << std::endl;
- return false;
- }
-
- if(this->stream.tellg() == this->filesize())
- return false;
-
- // Ignore if unset
- // comparison does not happen if tellg() == -1, return above
- this->reset();
-
- size_t readAmount = this->read_block_size;
- if((U64)this->stream.tellg() + this->read_block_size > this->filesize())
- readAmount = this->filesize() - this->stream.tellg();
-
- this->stream.read(this->buffer, readAmount);
- const U32 entries_read = this->stream.gcount() / Y;
- this->buffer_size = this->stream.gcount();
- if(this->stream.gcount() % Y != 0){
- std::cerr << Helpers::timestamp("ERROR", "IO") << "block is staggered" << std::endl;
- return false;
- }
-
- this->entry_tail = entries_read;
- this->entries = reinterpret_cast(this->buffer);
-
- return true;
-}
-
-template
-bool PackedEntryReader::seek(const size_t pos){
- this->stream.seekg(pos);
- this->reset(); // trigger reloading data when asking for next entry
- return(this->good());
-}
-
-}
-}
-
-
-
-#endif /* PACKEDENTRYREADER_H_ */
diff --git a/src/io/bcf/BCFEntry.cpp b/src/io/bcf/BCFEntry.cpp
index 7d953fa..537f3c9 100644
--- a/src/io/bcf/BCFEntry.cpp
+++ b/src/io/bcf/BCFEntry.cpp
@@ -10,7 +10,7 @@ namespace Tomahawk {
namespace BCF {
BCFEntry::BCFEntry(void):
- pointer(0),
+ l_data(0),
limit(262144),
l_ID(0),
p_genotypes(0),
@@ -30,7 +30,7 @@ BCFEntry::~BCFEntry(void){ delete [] this->data; }
void BCFEntry::resize(const U32 size){
char* temp = this->data;
this->data = new char[size];
- memcpy(this->data, temp, this->pointer);
+ memcpy(this->data, temp, this->l_data);
std::swap(temp, this->data);
delete [] temp;
this->body = reinterpret_cast(this->data);
@@ -40,11 +40,11 @@ void BCFEntry::resize(const U32 size){
}
void BCFEntry::add(const char* const data, const U32 length){
- if(this->pointer + length > this-> capacity())
- this->resize(this->pointer + length + 65536);
+ if(this->l_data + length > this-> capacity())
+ this->resize(this->l_data + length + 65536);
- memcpy(&this->data[this->pointer], data, length);
- this->pointer += length;
+ memcpy(&this->data[this->l_data], data, length);
+ this->l_data += length;
}
void BCFEntry::__parseID(U32& internal_pos){
@@ -126,25 +126,14 @@ bool BCFEntry::parse(void){
// Format key
	const base_type& fmt_type = *reinterpret_cast<const base_type*>(&this->data[internal_pos++]);
- //std::cerr << "fmt_key:" << (int)fmt_key_value << '\t' << "fmt_type: " << (int)fmt_type.high << '\t' << (int)fmt_type.low << std::endl;
- //std::cerr << (int)fmt_type_value2 << '\t' << (int)fmt_type_value1 << std::endl;
- //assert(fmt_type.high == 2);
if(fmt_type.high != 2){
this->isGood = false;
return false;
}
- this->isGood = true;
-
- /*
- for(U32 i = 0; i < 44; ++i){
- const SBYTE& fmt_type_value1 = *reinterpret_cast(&this->data[internal_pos++]);
- const SBYTE& fmt_type_value2 = *reinterpret_cast(&this->data[internal_pos++]);
- std::cerr << i << ':' << " " << (int)fmt_type_value1 << ',' << (int)fmt_type_value2 << '\t' << (int)(BCF::BCF_UNPACK_GENOTYPE(fmt_type_value1)) << ',' << (int)(BCF::BCF_UNPACK_GENOTYPE(fmt_type_value2)) << std::endl;
- }
- */
- this->genotypes = &this->data[internal_pos];
+ this->isGood = true;
+ this->genotypes = &this->data[internal_pos];
this->p_genotypes = internal_pos;
return true;
diff --git a/src/io/bcf/BCFEntry.h b/src/io/bcf/BCFEntry.h
index 996f37f..9add53f 100644
--- a/src/io/bcf/BCFEntry.h
+++ b/src/io/bcf/BCFEntry.h
@@ -10,28 +10,24 @@ const BYTE BCF_UNPACK_TOMAHAWK[3] = {2, 0, 1};
#define BCF_UNPACK_GENOTYPE(A) BCF_UNPACK_TOMAHAWK[(A >> 1)]
-#pragma pack(1)
-struct BCFAtomicBase{
+#pragma pack(push, 1)
+struct __attribute__((packed, aligned(1))) BCFAtomicBase{
BYTE low: 4, high: 4;
};
-#pragma pack(1)
-struct BCFAtomicSBYTE{
+struct __attribute__((packed, aligned(1))) BCFAtomicSBYTE{
SBYTE low: 4, high: 4;
};
-#pragma pack(1)
-struct BCFAtomicS16{
+struct __attribute__((packed, aligned(1))) BCFAtomicS16{
S16 low: 4, high: 12;
};
-#pragma pack(1)
-struct BCFAtomicS32{
+struct __attribute__((packed, aligned(1))) BCFAtomicS32{
S32 low: 4, high: 28;
};
-#pragma pack(1)
-struct BCFEntryBody{
+struct __attribute__((packed, aligned(1))) BCFEntryBody{
typedef BCFEntryBody self_type;
BCFEntryBody(); // disallow ctor and dtor
@@ -62,6 +58,8 @@ struct BCFEntryBody{
U32 n_sample: 8, n_fmt: 24;
};
+#pragma pack(pop)
+
struct BCFTypeString{
typedef BCFAtomicBase base_type;
@@ -70,23 +68,34 @@ struct BCFTypeString{
};
struct BCFEntry{
+public:
+ typedef BCFEntry self_type;
typedef IO::BasicBuffer buffer_type;
- typedef BCFEntryBody body_type;
- typedef BCFTypeString string_type;
- typedef BCFAtomicBase base_type;
+ typedef BCFEntryBody body_type;
+ typedef BCFTypeString string_type;
+ typedef BCFAtomicBase base_type;
+public:
BCFEntry(void);
~BCFEntry(void);
void resize(const U32 size);
void add(const char* const data, const U32 length);
- inline void reset(void){ this->pointer = 0; this->isGood = false; }
- inline const U32& size(void) const{ return(this->pointer); }
+ inline void reset(void){ this->l_data = 0; this->isGood = false; }
+ inline const U32& size(void) const{ return(this->l_data); }
inline const U32& capacity(void) const{ return(this->limit); }
inline U64 sizeBody(void) const{ return(this->body->l_shared + this->body->l_indiv); }
+ /**<
+ * Support function:
+ * Checks that there is exactly two alleles and that both the
+ * ref and alt allele are of length one (i.e. is a simple SNV->SNV)
+ * @return Returns TRUE if fulfilling these critera or FALSE otherwise
+ */
inline const bool isSimple(void) const{
- return((this->body->n_allele == 2) && (this->alleles[0].length == 1 && this->alleles[1].length == 1));
+ return((this->body->n_allele == 2) &&
+ (this->alleles[0].length == 1 &&
+ this->alleles[1].length == 1));
}
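+
+	// Example (illustrative): a biallelic SNV such as REF="A", ALT="G" satisfies
+	// isSimple(), whereas an indel such as REF="AT", ALT="A" does not.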
void __parseID(U32& internal_pos);
@@ -98,12 +107,12 @@ struct BCFEntry{
const bool& good(void) const{ return(this->isGood); }
public:
- U32 pointer; // byte width
- U32 limit; // capacity
- U32 l_ID;
- U32 p_genotypes; // position genotype data begin
- BYTE ref_alt; // parsed
- bool isGood;
+ U32 l_data; // byte width
+ U32 limit; // capacity
+ U32 l_ID;
+ U32 p_genotypes; // position genotype data begin
+ BYTE ref_alt; // parsed
+ bool isGood;
char* data; // hard copy data to buffer, interpret internally
body_type* body; // BCF2 body
string_type* alleles; // pointer to pointer of ref alleles and their lengths
diff --git a/src/io/bcf/BCFReader.cpp b/src/io/bcf/BCFReader.cpp
index ec9a978..984effe 100644
--- a/src/io/bcf/BCFReader.cpp
+++ b/src/io/bcf/BCFReader.cpp
@@ -85,7 +85,7 @@ bool BCFReader::parseHeader(void){
return false;
}
- if(strncmp(&this->bgzf_controller.buffer.data[0], "BCF\2\2", 5) != 0){
+ if(strncmp(this->bgzf_controller.buffer.data(), "BCF\2\2", 5) != 0){
std::cerr << Tomahawk::Helpers::timestamp("ERROR","BCF") << "Failed to validate MAGIC" << std::endl;
return false;
}
diff --git a/src/io/bcf/BCFReader.h b/src/io/bcf/BCFReader.h
index 32ff238..481bce0 100644
--- a/src/io/bcf/BCFReader.h
+++ b/src/io/bcf/BCFReader.h
@@ -8,18 +8,19 @@
#include "../BasicBuffer.h"
#include "../compression/BGZFController.h"
#include "BCFEntry.h"
+#include "../vcf/VCFHeader.h"
namespace Tomahawk {
namespace BCF {
class BCFReader{
- typedef BCFReader self_type;
- typedef IO::BasicBuffer buffer_type;
- typedef IO::BGZFController bgzf_controller_type;
- typedef IO::BGZFHeader bgzf_type;
- typedef VCF::VCFHeader header_type;
+ typedef BCFReader self_type;
+ typedef IO::BasicBuffer buffer_type;
+ typedef IO::BGZFController bgzf_controller_type;
+ typedef IO::BGZFHeader bgzf_type;
+ typedef VCF::VCFHeader header_type;
typedef VCF::VCFHeaderContig contig_type;
- typedef BCFEntry entry_type;
+ typedef BCFEntry entry_type;
public:
BCFReader();
@@ -33,13 +34,13 @@ class BCFReader{
bool open(const std::string input);
public:
- std::ifstream stream;
- U64 filesize;
- U32 current_pointer;
- buffer_type buffer;
- buffer_type header_buffer;
+ std::ifstream stream;
+ U64 filesize;
+ U32 current_pointer;
+ buffer_type buffer;
+ buffer_type header_buffer;
bgzf_controller_type bgzf_controller;
- header_type header;
+ header_type header;
};
}
diff --git a/src/io/compression/BGZFController.cpp b/src/io/compression/BGZFController.cpp
index e1d0b0c..b33a3bf 100644
--- a/src/io/compression/BGZFController.cpp
+++ b/src/io/compression/BGZFController.cpp
@@ -19,7 +19,7 @@ BGZFController::~BGZFController(){ this->buffer.deleteAll(); }
void BGZFController::Clear(){ this->buffer.reset(); }
U32 BGZFController::InflateSize(buffer_type& input) const{
-	const header_type& header = *reinterpret_cast<const header_type*>(&input.data[0]);
+	const header_type& header = *reinterpret_cast<const header_type*>(input.data());
if(!header.Validate()){
std::cerr << Helpers::timestamp("ERROR","BGZF") << "Invalid BGZF header" << std::endl;
std::cerr << Helpers::timestamp("DEBUG","BGZF") << "Output length: " << header.BSIZE << std::endl;
@@ -49,7 +49,7 @@ bool BGZFController::Inflate(buffer_type& input, buffer_type& output, const head
}
bool BGZFController::__Inflate(buffer_type& input, buffer_type& output, const header_type& header) const{
-	const U32& uncompressedLength = *reinterpret_cast<const U32*>(&input.data[input.size() - sizeof(U32)]);
+	const U32& uncompressedLength = *reinterpret_cast<const U32*>(&input[input.size() - sizeof(U32)]);
if(output.size() + uncompressedLength >= output.capacity())
output.resize((output.size() + uncompressedLength) + 65536);
@@ -63,9 +63,9 @@ bool BGZFController::__Inflate(buffer_type& input, buffer_type& output, const he
z_stream zs;
zs.zalloc = NULL;
zs.zfree = NULL;
- zs.next_in = (Bytef*)&input.data[Constants::BGZF_BLOCK_HEADER_LENGTH];
+ zs.next_in = (Bytef*)&input[Constants::BGZF_BLOCK_HEADER_LENGTH];
zs.avail_in = (header.BSIZE + 1) - 16;
- zs.next_out = (Bytef*)&output.data[output.pointer];
+ zs.next_out = (Bytef*)&output[output.size()];
zs.avail_out = (U32)avail_out;
int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
@@ -94,16 +94,16 @@ bool BGZFController::__Inflate(buffer_type& input, buffer_type& output, const he
//if(zs.total_out == 0)
// std::cerr << Helpers::timestamp("LOG", "BGZF") << "Detected empty BGZF block" << std::endl;
- output.pointer += zs.total_out;
+ output.n_chars += zs.total_out;
return(true);
}
bool BGZFController::InflateBlock(std::ifstream& stream, buffer_type& input){
input.resize(sizeof(header_type));
- stream.read(&input.data[0], IO::Constants::BGZF_BLOCK_HEADER_LENGTH);
-	const header_type* h = reinterpret_cast<const header_type*>(&input.data[0]);
- input.pointer = IO::Constants::BGZF_BLOCK_HEADER_LENGTH;
+ stream.read(input.data(), IO::Constants::BGZF_BLOCK_HEADER_LENGTH);
+	const header_type* h = reinterpret_cast<const header_type*>(input.data());
+ input.n_chars = IO::Constants::BGZF_BLOCK_HEADER_LENGTH;
if(!h->Validate()){
std::cerr << Tomahawk::Helpers::timestamp("ERROR", "BCF") << "Failed to validate!" << std::endl;
std::cerr << *h << std::endl;
@@ -114,16 +114,16 @@ bool BGZFController::InflateBlock(std::ifstream& stream, buffer_type& input){
// Recast because if buffer is resized then the pointer address is incorrect
// resulting in segfault
-	h = reinterpret_cast<const header_type*>(&input.data[0]);
+	h = reinterpret_cast<const header_type*>(input.data());
- stream.read(&input.data[IO::Constants::BGZF_BLOCK_HEADER_LENGTH], (h->BSIZE + 1) - IO::Constants::BGZF_BLOCK_HEADER_LENGTH);
+ stream.read(&input[IO::Constants::BGZF_BLOCK_HEADER_LENGTH], (h->BSIZE + 1) - IO::Constants::BGZF_BLOCK_HEADER_LENGTH);
if(!stream.good()){
std::cerr << Tomahawk::Helpers::timestamp("ERROR", "BCF") << "Truncated file..." << std::endl;
return false;
}
- input.pointer = h->BSIZE + 1;
-	const U32 uncompressed_size = *reinterpret_cast<const U32*>(&input[input.pointer - sizeof(U32)]);
+	input.n_chars = h->BSIZE + 1;
+	const U32 uncompressed_size = *reinterpret_cast<const U32*>(&input[input.size() - sizeof(U32)]);
this->buffer.resize(uncompressed_size + 1);
this->buffer.reset();
diff --git a/src/io/compression/BGZFController.h b/src/io/compression/BGZFController.h
index ff7a368..5082ec4 100644
--- a/src/io/compression/BGZFController.h
+++ b/src/io/compression/BGZFController.h
@@ -27,7 +27,7 @@ class BGZFController {
bool InflateBlock(std::ifstream& stream, buffer_type& input);
friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){
- stream.write(entry.buffer.data, entry.buffer.pointer);
+ stream.write(entry.buffer.data(), entry.buffer.size());
return stream;
}
diff --git a/src/io/compression/GZFConstants.h b/src/io/compression/GZFConstants.h
index 9395868..1cb4c62 100644
--- a/src/io/compression/GZFConstants.h
+++ b/src/io/compression/GZFConstants.h
@@ -1,7 +1,7 @@
#ifndef GZFCONSTANTS_H_
#define GZFCONSTANTS_H_
-#include "../../support/TypeDefinitions.h"
+#include "../../support/type_definitions.h"
namespace Tomahawk{
namespace IO{
@@ -22,11 +22,11 @@ const BYTE BGZF_ID1 = 66;
const BYTE BGZF_ID2 = 67;
const BYTE BGZF_LEN = 2;
-const SBYTE GZIP_WINDOW_BITS = -15;
-const SBYTE Z_DEFAULT_MEM_LEVEL = 8;
-const BYTE TGZF_BLOCK_HEADER_LENGTH = 20;
-const BYTE TGZF_BLOCK_FOOTER_LENGTH = 8;
-const BYTE BGZF_BLOCK_HEADER_LENGTH = 18;
+const SBYTE GZIP_WINDOW_BITS = -15;
+const SBYTE Z_DEFAULT_MEM_LEVEL = 8;
+const BYTE TGZF_BLOCK_HEADER_LENGTH = 20;
+const BYTE TGZF_BLOCK_FOOTER_LENGTH = 8;
+const BYTE BGZF_BLOCK_HEADER_LENGTH = 18;
}
}
diff --git a/src/io/compression/GZFHeader.h b/src/io/compression/GZFHeader.h
index 12de6b6..f4b4d61 100644
--- a/src/io/compression/GZFHeader.h
+++ b/src/io/compression/GZFHeader.h
@@ -6,8 +6,8 @@
namespace Tomahawk{
namespace IO{
-#pragma pack(1)
-struct __headerBase{
+#pragma pack(push, 1)
+struct __attribute__((packed, aligned(1))) __headerBase{
private:
typedef __headerBase self_type;
@@ -72,8 +72,7 @@ struct __headerBase{
block to 2^32 bytes and adds an extra "BC" field in the gzip header which
records the size.
*/
-#pragma pack(1)
-struct TGZFHeader : public __headerBase{
+struct __attribute__((packed, aligned(1))) TGZFHeader : public __headerBase{
private:
typedef TGZFHeader self_type;
typedef __headerBase parent_type;
@@ -121,8 +120,7 @@ struct TGZFHeader : public __headerBase{
block to 2^16 bytes and adds an extra "BC" field in the gzip header which
records the size.
*/
-#pragma pack(1)
-struct BGZFHeader : public __headerBase{
+struct __attribute__((packed, aligned(1))) BGZFHeader : public __headerBase{
private:
typedef BGZFHeader self_type;
typedef __headerBase parent_type;
@@ -157,6 +155,8 @@ struct BGZFHeader : public __headerBase{
}
};
+#pragma pack(pop)
+
}
}
diff --git a/src/io/compression/TGZFController.cpp b/src/io/compression/TGZFController.cpp
index 5351578..6e37d47 100644
--- a/src/io/compression/TGZFController.cpp
+++ b/src/io/compression/TGZFController.cpp
@@ -38,7 +38,7 @@ bool TGZFController::Inflate(buffer_type& input, buffer_type& output, const head
}
bool TGZFController::__Inflate(buffer_type& input, buffer_type& output, const header_type& header) const{
- const U32& uncompressedLength = *reinterpret_cast<const U32*>(&input.data[input.size() - sizeof(U32)]);
+ const U32& uncompressedLength = *reinterpret_cast<const U32*>(&input[input.size() - sizeof(U32)]);
if(output.size() + uncompressedLength >= output.capacity())
output.resize((output.size() + uncompressedLength) + 65536);
@@ -53,9 +53,9 @@ bool TGZFController::__Inflate(buffer_type& input, buffer_type& output, const he
z_stream zs;
zs.zalloc = NULL;
zs.zfree = NULL;
- zs.next_in = (Bytef*)&input.data[Constants::TGZF_BLOCK_HEADER_LENGTH];
+ zs.next_in = (Bytef*)&input[Constants::TGZF_BLOCK_HEADER_LENGTH];
zs.avail_in = (header.BSIZE + 1) - 16;
- zs.next_out = (Bytef*)&output.data[output.pointer];
+ zs.next_out = (Bytef*)&output[output.size()];
zs.avail_out = (U32)avail_out;
int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
@@ -84,7 +84,7 @@ bool TGZFController::__Inflate(buffer_type& input, buffer_type& output, const he
if(zs.total_out == 0)
std::cerr << Helpers::timestamp("LOG", "TGZF") << "Detected empty TGZF block" << std::endl;
- output.pointer += zs.total_out;
+ output.n_chars += zs.total_out;
return(true);
}
@@ -92,7 +92,7 @@ bool TGZFController::__Inflate(buffer_type& input, buffer_type& output, const he
bool TGZFController::Deflate(const buffer_type& buffer){
this->buffer.resize(buffer);
- memset(this->buffer.data, 0, Constants::TGZF_BLOCK_HEADER_LENGTH);
+ memset(this->buffer.data(), 0, Constants::TGZF_BLOCK_HEADER_LENGTH);
this->buffer[0] = Constants::GZIP_ID1;
this->buffer[1] = Constants::GZIP_ID2;
@@ -113,8 +113,8 @@ bool TGZFController::Deflate(const buffer_type& buffer){
z_stream zs;
zs.zalloc = NULL;
zs.zfree = NULL;
- zs.next_in = (Bytef*)buffer.data;
- zs.avail_in = buffer.pointer;
+ zs.next_in = (Bytef*)buffer.data();
+ zs.avail_in = buffer.size();
zs.next_out = (Bytef*)&this->buffer[Constants::TGZF_BLOCK_HEADER_LENGTH];
zs.avail_out = this->buffer.width -
Constants::TGZF_BLOCK_HEADER_LENGTH -
@@ -169,18 +169,18 @@ bool TGZFController::Deflate(const buffer_type& buffer){
//std::cerr << Helpers::timestamp("DEBUG") << "Time: " << *time << std::endl;
- memset(&buffer.data[compressedLength - Constants::TGZF_BLOCK_FOOTER_LENGTH], 0, Constants::TGZF_BLOCK_FOOTER_LENGTH);
+ memset(&buffer.buffer[compressedLength - Constants::TGZF_BLOCK_FOOTER_LENGTH], 0, Constants::TGZF_BLOCK_FOOTER_LENGTH);
// store the CRC32 checksum
U32 crc = crc32(0, NULL, 0);
- crc = crc32(crc, (Bytef*)buffer.data, buffer.pointer);
+ crc = crc32(crc, (Bytef*)buffer.data(), buffer.size());
U32* c = reinterpret_cast<U32*>(&this->buffer[compressedLength - Constants::TGZF_BLOCK_FOOTER_LENGTH]);
*c = crc;
- U32 convert = buffer.pointer; // avoid potential problems when casting from U64 to U32 by interpretation
+ U32 convert = buffer.size(); // avoid potential problems when casting from U64 to U32 by interpretation
U32* uncompressed = reinterpret_cast<U32*>(&this->buffer[compressedLength - sizeof(U32)]);
*uncompressed = convert; // Store uncompressed length
- this->buffer.pointer = compressedLength;
+ this->buffer.n_chars = compressedLength;
//std::cerr << "Writing: " << convert << '/' << *uncompressed << '\t' << compressedLength << '\t' << *test << '\t' << buffer.size() << '\t' << "At pos: " << (compressedLength - sizeof(U32)) << '\t' << buffer.pointer << '\t' << *c << '\t' << convert << std::endl;
return true;
@@ -191,11 +191,11 @@ bool TGZFController::Deflate(buffer_type& meta, buffer_type& rle){
return(this->Deflate(meta));
}
-bool TGZFController::InflateBlock(std::ifstream& stream, buffer_type& input){
+bool TGZFController::InflateBlock(std::istream& stream, buffer_type& input){
input.resize(sizeof(header_type));
- stream.read(&input.data[0], IO::Constants::TGZF_BLOCK_HEADER_LENGTH);
- const header_type* h = reinterpret_cast<const header_type*>(&input.data[0]);
- input.pointer = IO::Constants::TGZF_BLOCK_HEADER_LENGTH;
+ stream.read(input.data(), IO::Constants::TGZF_BLOCK_HEADER_LENGTH);
+ const header_type* h = reinterpret_cast<const header_type*>(input.data());
+ input.n_chars = IO::Constants::TGZF_BLOCK_HEADER_LENGTH;
if(!h->Validate()){
std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TGZF") << "Failed to validate!" << std::endl;
std::cerr << *h << std::endl;
@@ -206,16 +206,16 @@ bool TGZFController::InflateBlock(std::ifstream& stream, buffer_type& input){
// Recast because if buffer is resized then the pointer address is incorrect
// resulting in segfault
- h = reinterpret_cast<const header_type*>(&input.data[0]);
+ h = reinterpret_cast<const header_type*>(input.data());
- stream.read(&input.data[IO::Constants::TGZF_BLOCK_HEADER_LENGTH], h->BSIZE - IO::Constants::TGZF_BLOCK_HEADER_LENGTH);
+ stream.read(&input[IO::Constants::TGZF_BLOCK_HEADER_LENGTH], h->BSIZE - IO::Constants::TGZF_BLOCK_HEADER_LENGTH);
if(!stream.good()){
std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TGZF") << "Truncated file..." << std::endl;
return false;
}
- input.pointer = h->BSIZE;
- const U32 uncompressed_size = *reinterpret_cast<const U32*>(&input[input.pointer - sizeof(U32)]);
+ input.n_chars = h->BSIZE;
+ const U32 uncompressed_size = *reinterpret_cast<const U32*>(&input[input.size() - sizeof(U32)]);
this->buffer.resize(uncompressed_size);
this->buffer.reset();
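
The "Recast because if buffer is resized then the pointer address is incorrect" comment describes a general pitfall: growing a buffer can reallocate its storage, so any pointer obtained before the resize may dangle. A small illustration with std::vector standing in for the Tomahawk buffer type (assumption: the real buffer reallocates on resize in the same way):

    #include <iostream>
    #include <vector>

    int main(){
        std::vector<char> input(18, 'A');   // pretend: the fixed-size block header just read
        const char* h = input.data();       // pointer into the current storage

        input.resize(65536);                // grow to hold the rest of the block;
                                            // this typically reallocates the storage
        // *h would now be a use-after-free if the storage moved.

        h = input.data();                   // re-derive the pointer, as InflateBlock does
        std::cout << *h << std::endl;       // prints 'A' again, safely
        return 0;
    }
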
diff --git a/src/io/compression/TGZFController.h b/src/io/compression/TGZFController.h
index bc9dd03..1b31f14 100644
--- a/src/io/compression/TGZFController.h
+++ b/src/io/compression/TGZFController.h
@@ -29,13 +29,13 @@ class TGZFController{
void Clear();
bool Inflate(buffer_type& input, buffer_type& output, const header_type& header) const;
bool Inflate(buffer_type& input, buffer_type& output) const;
- bool InflateBlock(std::ifstream& stream, buffer_type& input);
+ bool InflateBlock(std::istream& stream, buffer_type& input);
bool Deflate(const buffer_type& buffer);
bool Deflate(buffer_type& meta, buffer_type& rle);
friend std::ostream& operator<<(std::ostream& stream, const self_type& entry){
- stream.write(entry.buffer.data, entry.buffer.pointer);
+ stream.write(entry.buffer.data(), entry.buffer.size());
return stream;
}
diff --git a/src/io/compression/TGZFControllerStream.cpp b/src/io/compression/TGZFControllerStream.cpp
index 4c04aeb..817f96c 100644
--- a/src/io/compression/TGZFControllerStream.cpp
+++ b/src/io/compression/TGZFControllerStream.cpp
@@ -13,8 +13,8 @@ bool TGZFControllerStream::InflateOpen(std::ifstream& stream){
this->buffer.reset();
this->buffer.resize(this->chunk_size);
this->bytes_read = 0;
- stream.read(&this->buffer.data[0], IO::Constants::TGZF_BLOCK_HEADER_LENGTH);
- const header_type* h = reinterpret_cast<const header_type*>(&this->buffer.data[0]);
+ stream.read(this->buffer.data(), IO::Constants::TGZF_BLOCK_HEADER_LENGTH);
+ const header_type* h = reinterpret_cast<const header_type*>(this->buffer.data());
if(!h->Validate()){
std::cerr << Tomahawk::Helpers::timestamp("ERROR", "TGZF") << "Failed to validate!" << std::endl;
@@ -80,19 +80,19 @@ bool TGZFControllerStream::__Inflate(std::ifstream& stream, const BYTE* output,
if(this->bytes_read + this->chunk_size > this->BSIZE)
read_amount = this->BSIZE - this->bytes_read;
- stream.read(&this->buffer.data[0], read_amount);
+ stream.read(this->buffer.data(), read_amount);
size_t total = stream.gcount();
this->bytes_read += total;
//std::cerr << "READ: " << total << "\t" << stream.tellg() << std::endl;
this->d_stream.avail_in = total;
- this->d_stream.next_in = (Bytef*)&this->buffer.data[0];
+ this->d_stream.next_in = (Bytef*)this->buffer.data();
if(total == 0){
std::cerr << Helpers::timestamp("WARNING","TGZF") << "Nothing read!" << std::endl;
return false;
}
- this->buffer.pointer = total;
+ this->buffer.n_chars = total;
}
const U32 tot_out = this->d_stream.total_out;
diff --git a/src/io/compression/TGZFEntryIterator.h b/src/io/compression/TGZFEntryIterator.h
index cc04380..0a217cf 100644
--- a/src/io/compression/TGZFEntryIterator.h
+++ b/src/io/compression/TGZFEntryIterator.h
@@ -74,7 +74,7 @@ bool TGZFEntryIterator::nextEntry(const T*& entry){
}
U32 ret_size = 0;
- if(!parent_type::Inflate(this->stream, (BYTE*)&output_buffer.data[0], this->chunk_size, ret_size)){
+ if(!parent_type::Inflate(this->stream, (BYTE*)output_buffer.data(), this->chunk_size, ret_size)){
if(this->STATE != TGZF_STATE::TGZF_END){
std::cerr << Helpers::timestamp("ERROR","TGZF") << "Invalid state (" << this->STATE << ")" << std::endl;
exit(1);
@@ -97,7 +97,7 @@ bool TGZFEntryIterator::nextEntry(const T*& entry){
this->reset(); // reset state
}
- if(!parent_type::Inflate(this->stream, (BYTE*)&output_buffer.data[0], this->chunk_size, ret_size)){
+ if(!parent_type::Inflate(this->stream, (BYTE*)output_buffer.data(), this->chunk_size, ret_size)){
if(this->STATE != TGZF_STATE::TGZF_END){
std::cerr << Helpers::timestamp("ERROR","TGZF") << "Invalid state (" << this->STATE << ")" << std::endl;
exit(1);
@@ -111,10 +111,10 @@ bool TGZFEntryIterator::nextEntry(const T*& entry){
}
- this->output_buffer.pointer = ret_size;
- this->n_entries = ret_size / sizeof(T);
- this->pointer = 0;
- this->entries = reinterpret_cast<const T*>(this->output_buffer.data);
+ this->output_buffer.n_chars = ret_size;
+ this->n_entries = ret_size / sizeof(T);
+ this->pointer = 0;
+ this->entries = reinterpret_cast<const T*>(this->output_buffer.data());
}
entry = &this->entries[this->pointer++];
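
nextEntry() converts each inflated chunk into ret_size / sizeof(T) fixed-width records by reinterpreting the output buffer as an array of T and handing out one element per call. The same pattern in isolation, with a made-up DemoEntry record rather than the actual Tomahawk entry type:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    #pragma pack(push, 1)
    struct DemoEntry {          // illustrative fixed-width record, not the real entry type
        uint32_t positionA;
        uint32_t positionB;
        double   R2;
    };
    #pragma pack(pop)

    int main(){
        // Pretend these bytes came back from the inflater: two serialized records.
        std::vector<char> inflated(2 * sizeof(DemoEntry));
        DemoEntry e{105000, 106000, 0.92};
        std::memcpy(inflated.data(), &e, sizeof(e));
        e.positionB = 107000; e.R2 = 0.31;
        std::memcpy(inflated.data() + sizeof(e), &e, sizeof(e));

        // Derive the record count from the byte count and reinterpret the buffer,
        // which only works because the records were written with this exact layout.
        const size_t n_entries = inflated.size() / sizeof(DemoEntry);
        const DemoEntry* entries = reinterpret_cast<const DemoEntry*>(inflated.data());

        for(size_t i = 0; i < n_entries; ++i)
            std::cout << entries[i].positionA << '-' << entries[i].positionB
                      << " r2=" << entries[i].R2 << std::endl;
        return 0;
    }
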
diff --git a/src/io/output_writer.cpp b/src/io/output_writer.cpp
new file mode 100644
index 0000000..ebd2643
--- /dev/null
+++ b/src/io/output_writer.cpp
@@ -0,0 +1,187 @@
+#include "output_writer.h"
+
+namespace Tomahawk{
+namespace IO{
+
+OutputWriter::OutputWriter(void) :
+ owns_pointers(true),
+ writing_sorted_(false),
+ writing_sorted_partial_(false),
+ n_entries(0),
+ n_progress_count(0),
+ n_blocks(0),
+ l_flush_limit(2000000),
+ l_largest_uncompressed(0),
+ stream(nullptr),
+ buffer(this->l_flush_limit*2),
+ spin_lock(new spin_lock_type),
+ index_(new index_type),
+ footer_(new footer_type)
+{
+
+}
+
+OutputWriter::OutputWriter(std::string input_file) :
+ owns_pointers(true),
+ writing_sorted_(false),
+ writing_sorted_partial_(false),
+ n_entries(0),
+ n_progress_count(0),
+ n_blocks(0),
+ l_flush_limit(2000000),
+ l_largest_uncompressed(0),
+ stream(new std::ofstream(input_file, std::ios::binary | std::ios::out)),
+ buffer(this->l_flush_limit*2),
+ spin_lock(new spin_lock_type),
+ index_(new index_type),
+ footer_(new footer_type)
+{
+
+}
+
+OutputWriter::OutputWriter(const self_type& other) :
+ owns_pointers(false),
+ writing_sorted_(other.writing_sorted_),
+ writing_sorted_partial_(other.writing_sorted_partial_),
+ n_entries(other.n_entries),
+ n_progress_count(other.n_progress_count),
+ n_blocks(other.n_blocks),
+ l_flush_limit(other.l_flush_limit),
+ l_largest_uncompressed(0),
+ stream(other.stream),
+ buffer(other.buffer.capacity()),
+ spin_lock(other.spin_lock),
+ index_(other.index_),
+ footer_(other.footer_)
+{
+
+}
+
+OutputWriter::~OutputWriter(void){
+ if(this->owns_pointers){
+ this->stream->flush();
+ this->stream->close();
+ delete this->stream;
+ delete this->spin_lock;
+ delete this->index_;
+ delete this->footer_;
+ }
+}
+
+bool OutputWriter::open(const std::string& output_file){
+ if(output_file.size() == 0)
+ return false;
+
+ this->CheckOutputNames(output_file);
+ this->filename = output_file;
+
+ this->stream = new std::ofstream(this->basePath + this->baseName + '.' + Tomahawk::Constants::OUTPUT_LD_SUFFIX, std::ios::binary | std::ios::out);
+ if(this->stream->good() == false){
+ std::cerr << "Failed to open: " << output_file << std::endl;
+ return false;
+ }
+
+ return true;
+}
+
+int OutputWriter::writeHeaders(twk_header_type& twk_header){
+ const std::string command = "##tomahawk_calcCommand=" + Helpers::program_string();
+ twk_header.getLiterals() += command;
+ // Set file type to TWO
+ twk_header.magic_.file_type = 1;
+
+ return(twk_header.write(*this->stream));
+}
+
+void OutputWriter::writeFinal(void){
+ this->footer_->l_largest_uncompressed = this->l_largest_uncompressed;
+ this->footer_->offset_end_of_data = this->stream->tellp();
+ this->index_->setSorted(this->isSorted());
+ this->index_->setPartialSorted(this->isPartialSorted());
+
+ this->stream->flush();
+ *this->stream << *this->index_;
+ *this->stream << *this->footer_;
+ this->stream->flush();
+}
+
+void OutputWriter::flush(void){
+ if(this->buffer.size() > 0){
+ if(!this->compressor.Deflate(this->buffer)){
+ std::cerr << Helpers::timestamp("ERROR","TGZF") << "Failed deflate DATA..." << std::endl;
+ exit(1);
+ }
+
+ if(this->buffer.size() > l_largest_uncompressed)
+ this->l_largest_uncompressed = this->buffer.size();
+
+ this->spin_lock->lock();
+ this->index_entry.byte_offset = (U64)this->stream->tellp();
+ this->index_entry.uncompressed_size = this->buffer.size();
+ this->stream->write(this->compressor.buffer.data(), this->compressor.buffer.size());
+ this->index_entry.byte_offset_end = (U64)this->stream->tellp();
+ this->index_entry.n_variants = this->buffer.size() / sizeof(entry_type);
+ //*this->stream << this->index_entry;
+ this->index_->getContainer() += this->index_entry;
+ //std::cerr << this->index_entry.byte_offset_from << "->" << this->index_entry.byte_offset_to << " for " << this->index_entry.n_entries << " of " << this->index_entry.uncompressed_size << std::endl;
+ ++this->n_blocks;
+
+ this->spin_lock->unlock();
+
+ this->buffer.reset();
+ this->compressor.Clear();
+ this->index_entry.reset();
+ }
+}
+
+void OutputWriter::operator<<(const container_type& container){
+ for(size_type i = 0; i < container.size(); ++i)
+ this->buffer << container[i];
+
+ this->n_entries += buffer.size() / sizeof(entry_type);
+ *this << this->buffer;
+}
+
+void OutputWriter::operator<<(buffer_type& buffer){
+ if(buffer.size() > 0){
+ if(!this->compressor.Deflate(buffer)){
+ std::cerr << Helpers::timestamp("ERROR","TGZF") << "Failed deflate DATA..." << std::endl;
+ exit(1);
+ }
+
+ if(buffer.size() > l_largest_uncompressed)
+ this->l_largest_uncompressed = buffer.size();
+
+ // Lock
+ this->spin_lock->lock();
+
+ this->index_entry.byte_offset = (U64)this->stream->tellp();
+ this->index_entry.uncompressed_size = buffer.size();
+ this->stream->write(this->compressor.buffer.data(), this->compressor.buffer.size());
+ this->index_entry.byte_offset_end = (U64)this->stream->tellp();
+ this->index_entry.n_variants = buffer.size() / sizeof(entry_type);
+ this->index_->getContainer() += this->index_entry;
+ ++this->n_blocks;
+
+ // Unlock
+ this->spin_lock->unlock();
+
+ buffer.reset();
+ this->compressor.Clear();
+ this->index_entry.reset();
+ }
+}
+
+void OutputWriter::CheckOutputNames(const std::string& input){
+ std::vector